## Comparison of Dimensionality Reduction Techniques on the Enhanced Ads Dataset

In [1]:
import time

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Load the comma-separated ads data. The file has no header row, so the columns
# are labelled with integers.
# `on_bad_lines='warn'` skips malformed rows while reporting each one. It is the
# replacement for `error_bad_lines=False`, which was deprecated in pandas 1.3
# and removed in pandas 2.0.
adData = pd.read_csv('ad_data.csv', sep=',', header=None, on_bad_lines='warn')
adData.head()

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1549,1550,1551,1552,1553,1554,1555,1556,1557,1558
0,125,125,1.0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ad.
1,57,468,8.2105,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ad.
2,33,230,6.9696,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ad.
3,60,468,7.8,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ad.
4,60,468,7.8,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ad.


The `pd.read_csv()` function takes the filename as a string and the delimiter of the CSV file, which is ",". Because the dataset has no header row, we state this explicitly with `header=None`. The last argument, **error_bad_lines=False**, tells pandas to skip malformed lines instead of raising an error, so that the rest of the data still loads. (This argument was deprecated in pandas 1.3 and removed in pandas 2.0; its replacement is `on_bad_lines`, for example `on_bad_lines='skip'`.)

In [3]:
# Report the dimensions of the loaded data as (rows, columns).
row_count, column_count = adData.shape
print((row_count, column_count))

(3279, 1559)


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) of the raw data;
# pandas computes them for the numeric columns only.
numeric_summary = adData.describe()
numeric_summary

Unnamed: 0,4,5,6,7,8,9,10,11,12,13,...,1548,1549,1550,1551,1552,1553,1554,1555,1556,1557
count,3279.0,3279.0,3279.0,3279.0,3279.0,3279.0,3279.0,3279.0,3279.0,3279.0,...,3279.0,3279.0,3279.0,3279.0,3279.0,3279.0,3279.0,3279.0,3279.0,3279.0
mean,0.00427,0.011589,0.004575,0.003355,0.003965,0.011589,0.003355,0.00488,0.009149,0.004575,...,0.006099,0.004575,0.00366,0.00244,0.00305,0.006404,0.012809,0.013419,0.009759,0.001525
std,0.065212,0.107042,0.067491,0.057831,0.06285,0.107042,0.057831,0.069694,0.095227,0.067491,...,0.077872,0.067491,0.060393,0.049341,0.055148,0.079783,0.112466,0.115077,0.09832,0.039026
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


As we saw from the shape of the data, the dataset has 3279 examples with 1559 variables. The variable set has both categorical and numerical variables. The summary statistics are only derived for numerical data.

In [5]:
# Separate the independent variables (features) from the dependent variable (label).
# Columns 0..1557 are the features and column 1558 is the label, as seen in the
# `adData.head()` output above.
#
# `.copy()` gives X its own data. Later cells assign cleaned columns back into X;
# doing that on a bare slice of `adData` triggers pandas' SettingWithCopyWarning
# and leaves it ambiguous whether `adData` is modified as well.
X = adData.loc[:, 0:1557].copy()
print(X.shape)

# preparing y variable
y = adData[1558]
print(y.shape)

(3279, 1558)
(3279,)


In [6]:
# Preview the first 15 rows of the feature matrix.
X.iloc[:15]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1548,1549,1550,1551,1552,1553,1554,1555,1556,1557
0,125,125,1.0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,57,468,8.2105,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,33,230,6.9696,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,60,468,7.8,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,60,468,7.8,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,60,468,7.8,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,59,460,7.7966,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,60,234,3.9,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,60,468,7.8,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,60,468,7.8,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


From the output, we can see that there are many missing values in the dataset, which are represented by **?**. For further analysis, we have to remove these special characters and then replace those cells with assumed values. One popular method of replacing special characters is to impute the mean of the respective feature. Let's adopt this strategy. However, before doing that, let's look at the data types for this dataset to adopt a suitable replacement strategy.

In [7]:
# Show the dtype of every feature column, to see which ones were read as text.
column_dtypes = X.dtypes
print(column_dtypes)

0       object
1       object
2       object
3       object
4        int64
         ...  
1553     int64
1554     int64
1555     int64
1556     int64
1557     int64
Length: 1558, dtype: object


In [8]:
# Columns 0-2 were read as strings because missing values are written as '?'.
# Swap '?' for the text 'nan' so that each column can be converted to float.
# (Column 3 is also object-typed; it is converted by the loop in the next cell.)
#
# regex=False makes '?' an explicit literal: '?' is a regex metacharacter, and
# the default of `regex` in `Series.str.replace` changed from True to False in
# pandas 2.0.
for i in range(0, 3):
    X[i] = X[i].str.replace('?', 'nan', regex=False).values.astype(float)
print(X.head(15))

     0      1       2    3     4     5     6     7     8     9     ...  1548  \
0   125.0  125.0  1.0000    1     0     0     0     0     0     0  ...     0   
1    57.0  468.0  8.2105    1     0     0     0     0     0     0  ...     0   
2    33.0  230.0  6.9696    1     0     0     0     0     0     0  ...     0   
3    60.0  468.0  7.8000    1     0     0     0     0     0     0  ...     0   
4    60.0  468.0  7.8000    1     0     0     0     0     0     0  ...     0   
5    60.0  468.0  7.8000    1     0     0     0     0     0     0  ...     0   
6    59.0  460.0  7.7966    1     0     0     0     0     0     0  ...     0   
7    60.0  234.0  3.9000    1     0     0     0     0     0     0  ...     0   
8    60.0  468.0  7.8000    1     0     0     0     0     0     0  ...     0   
9    60.0  468.0  7.8000    1     0     0     0     0     0     0  ...     0   
10    NaN    NaN     NaN    1     0     0     0     0     0     0  ...     0   
11   90.0   52.0  0.5777    1     0     

To clean the first three columns, we loop over them with a **for** loop and the **range()** function. Since the first three columns are of the **object** or **string type**, we use the **.str.replace()** function, which stands for "string replace". After replacing the special character **?** in the data with nan, we convert the data type to **float** with the **.values.astype(float)** function, which is required for further processing. By printing the first 15 examples, we can see that all special characters have been replaced with **nan** or **NaN** values.

In [9]:
# Convert every remaining column (3 through the last feature column) to float,
# mapping any '?' placeholder to NaN first.
# The loop is bounded by X's width rather than a literal: a hard-coded
# range(3, 1557) stops one short and skips the last feature column, 1557.
for i in range(3, X.shape[1]):
    X[i] = X[i].replace('?', 'NaN').values.astype(float)

Now that we have replaced special characters in the data with NaN values, we can use the fillna() function in pandas to replace the NaN values with the mean of the column.

In [10]:
# Impute missing values: replace each NaN with the mean of its own column.
# `X.mean()` is a Series of per-column means and `fillna` matches it to the
# columns by label, so every column is covered. A hard-coded range(0, 1557)
# loop would miss the last feature column, 1557.
X = X.fillna(X.mean())
print(X.head(15))

          0           1         2     3     4     5     6     7     8     \
0   125.000000  125.000000  1.000000   1.0   0.0   0.0   0.0   0.0   0.0   
1    57.000000  468.000000  8.210500   1.0   0.0   0.0   0.0   0.0   0.0   
2    33.000000  230.000000  6.969600   1.0   0.0   0.0   0.0   0.0   0.0   
3    60.000000  468.000000  7.800000   1.0   0.0   0.0   0.0   0.0   0.0   
4    60.000000  468.000000  7.800000   1.0   0.0   0.0   0.0   0.0   0.0   
5    60.000000  468.000000  7.800000   1.0   0.0   0.0   0.0   0.0   0.0   
6    59.000000  460.000000  7.796600   1.0   0.0   0.0   0.0   0.0   0.0   
7    60.000000  234.000000  3.900000   1.0   0.0   0.0   0.0   0.0   0.0   
8    60.000000  468.000000  7.800000   1.0   0.0   0.0   0.0   0.0   0.0   
9    60.000000  468.000000  7.800000   1.0   0.0   0.0   0.0   0.0   0.0   
10   64.021886  155.344828  3.911953   1.0   0.0   0.0   0.0   0.0   0.0   
11   90.000000   52.000000  0.577700   1.0   0.0   0.0   0.0   0.0   0.0   
12   90.0000

In [11]:
# Scale the features with MinMaxScaler; scaling data is useful in the modeling step.
# MinMaxScaler is imported in the notebook's top import cell with the other
# sklearn imports.
# NOTE(review): the scaler is fitted on the full dataset, before the train/test
# split further down, so test-set minima/maxima influence the scaling.
minmaxScaler = MinMaxScaler()

# fit the scaler on X and apply it in one step, then wrap the result in a DataFrame
X_tran = pd.DataFrame(minmaxScaler.fit_transform(X))
print(X_tran.head())

       0         1         2     3     4     5     6     7     8     9     \
0  0.194053  0.194053  0.016642   1.0   0.0   0.0   0.0   0.0   0.0   0.0   
1  0.087637  0.730829  0.136820   1.0   0.0   0.0   0.0   0.0   0.0   0.0   
2  0.050078  0.358372  0.116138   1.0   0.0   0.0   0.0   0.0   0.0   0.0   
3  0.092332  0.730829  0.129978   1.0   0.0   0.0   0.0   0.0   0.0   0.0   
4  0.092332  0.730829  0.129978   1.0   0.0   0.0   0.0   0.0   0.0   0.0   

   ...  1548  1549  1550  1551  1552  1553  1554  1555  1556  1557  
0  ...   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  
1  ...   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  
2  ...   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  
3  ...   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  
4  ...   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  

[5 rows x 1558 columns]


In [12]:
# Build a higher-dimensional dataset by placing two copies of the scaled
# features side by side, which doubles the number of columns.
scaled_values = X_tran.values
X_hd = pd.DataFrame(np.concatenate([scaled_values, scaled_values], axis=1))
X_hd.shape

(3279, 3116)

In [13]:
# Create random samples from a normal distribution with mean=0 and
# standard-deviation=0.1, with the same shape as the high-dimensional
# DataFrame created in the step above.

# defining the mean and standard deviation
mu, sigma = 0, 0.1

# A seeded generator makes the noise, and every result derived from it,
# identical on each Restart & Run All. The seed matches the random_state used
# for the train/test split.
rng = np.random.default_rng(123)

# Take the shape from X_hd rather than hard-coding [3279, 3116], so this cell
# stays correct if the row count or the duplication factor changes.
noise = rng.normal(mu, sigma, size=X_hd.shape)
noise.shape

(3279, 3116)

In [16]:
# Perturb the duplicated features element-wise with the sampled noise to obtain
# the final high-dimensional dataset.
df_new = X_hd.add(noise)
df_new

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3106,3107,3108,3109,3110,3111,3112,3113,3114,3115
0,0.102732,0.022539,0.074652,0.928914,0.235244,-0.236777,-0.005354,0.036706,0.035756,0.057531,...,0.025597,0.130177,0.100247,0.024164,-0.060928,0.025707,-0.146917,-0.057479,-0.025298,-0.074395
1,-0.018388,0.645097,0.279225,0.796055,-0.108358,-0.006007,-0.017139,-0.100841,0.010960,0.082591,...,0.147446,-0.039664,0.048309,-0.118841,0.046637,0.203041,-0.121009,-0.067697,-0.044437,0.130010
2,-0.118429,0.293342,-0.168401,1.178704,0.116382,-0.145719,0.018960,0.038215,0.091100,-0.091120,...,0.286778,-0.013108,-0.019836,0.221494,-0.115564,0.054080,-0.040792,0.120077,-0.114470,-0.089175
3,0.167291,0.724120,0.022938,0.991653,0.010992,-0.004901,-0.174410,0.040267,-0.117135,0.084591,...,-0.007333,-0.055101,0.149898,0.076970,-0.063795,-0.041484,-0.078621,-0.009983,-0.008430,-0.003738
4,0.177322,0.651475,-0.008781,1.011824,0.185948,-0.143975,0.108694,0.074283,-0.013574,-0.199283,...,-0.005234,0.081794,0.093410,-0.025354,-0.164223,-0.011667,0.020925,-0.125587,0.080512,0.183117
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3274,0.257379,0.069027,-0.002441,0.095223,0.184011,-0.021662,0.079660,-0.106242,0.091453,-0.037667,...,-0.076646,0.079244,-0.070320,-0.057649,0.160676,-0.167600,0.106329,-0.127919,-0.140064,0.109484
3275,0.198591,0.259914,0.056171,0.981976,0.190742,0.132772,-0.183514,0.067300,-0.041736,-0.014906,...,-0.060193,0.077089,-0.082254,0.040124,0.013007,0.057456,0.065024,-0.023069,-0.082469,0.040026
3276,0.061017,0.051765,0.130609,1.170746,-0.058320,0.025000,-0.003938,-0.007953,-0.027228,-0.151740,...,0.037147,0.059367,-0.124920,0.107478,-0.070632,0.008559,0.109661,0.137609,0.109085,0.057354
3277,0.088011,0.328971,-0.228082,0.933046,-0.016497,0.084500,-0.012877,0.220831,-0.023593,-0.024648,...,0.068173,0.119861,0.040730,-0.095574,-0.101827,0.008816,0.017339,0.110420,0.101506,0.004300


In [17]:
# Hold out 30% of the rows as a test set; the fixed random_state keeps the
# partition the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    df_new,
    y,
    test_size=0.3,
    random_state=123,
)

#### Backward Elimination (Recursive Elimination)