# Internet Ad Classification Practice

In [32]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import os
import scipy.stats as stat
from sklearn import preprocessing


In [33]:
# Load the UCI "Internet Advertisements" data set directly from the archive.
# The file has no header row, so pandas assigns integer column labels.
AD_DATA_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/internet_ads/ad.data"
df = pd.read_csv(AD_DATA_URL, header=None, low_memory=False)

In [34]:
# Turn the integer column labels into strings so columns can be addressed by name.
df.columns = df.columns.map(str)

In [35]:
# The last column ('1558') holds the class label; give it a readable name.
df = df.rename(columns={'1558': 'ad'})

In [36]:
# Summary of the first column as loaded (before any cleaning).
first_column = df.iloc[:, 0]
first_column.describe()

In [6]:
# Identify the missing values (encoded as '?') in the first four columns.
# A column-wise string test is used instead of an element-wise applymap:
# applymap is deprecated in recent pandas, and `'?' in x` raises TypeError
# for any cell that is not a string.
newdf = df.iloc[:, [0, 1, 2, 3]].apply(
    lambda col: col.astype(str).str.contains('?', regex=False)
)

# One row per observation; highlighted cells mark a missing value.
plt.figure(figsize=(7, 6))
sns.heatmap(newdf, cbar=False, yticklabels=False, cmap='viridis');

In [7]:
# Report how many '?' placeholders each inspected column contains.
for col_name, is_missing in newdf.items():
    print(f"column[{col_name}] has missing values -{sum(is_missing)}")

In [8]:
def replace_missing(df):
    """Return a numeric copy of ``df`` with '?' placeholders imputed by column means.

    Every cell whose text contains a '?' is treated as missing. Each column is
    then converted to float, and its missing entries are filled with the mean
    of the remaining values in that column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns hold numeric text, possibly with '?' placeholders.

    Returns
    -------
    pandas.DataFrame
        A new frame with float columns; the input frame is left untouched.

    Raises
    ------
    ValueError
        If a cell that does not contain '?' cannot be converted to float.
    """
    cleaned = df.copy()  # work on a copy so the caller's frame is not mutated
    for col in cleaned.columns:
        # np.nan rather than np.NAN: the upper-case alias was removed in NumPy 2.0.
        numeric = cleaned[col].replace('[?]', np.nan, regex=True).astype('float')
        cleaned[col] = numeric.fillna(numeric.mean())
    return cleaned

In [9]:
# Normalise every column label to a lower-case string.
df.columns = [str(label).lower() for label in df.columns]

In [10]:
# Impute the '?' placeholders in the first four columns and write the
# resulting float values back into the main frame.
leading_cols = ['0', '1', '2', '3']
imputed = replace_missing(df[leading_cols].copy())
df[leading_cols] = imputed.values

In [11]:
# Column '3' was mean-imputed above; round it back to whole numbers.
df[['3']] = df[['3']].round()

In [12]:
# Summary statistics of the four cleaned columns.
df.loc[:, ['0', '1', '2', '3']].describe()

### Looking at distributions of the first three variables

In [13]:
# Give the first three columns descriptive names.
# NOTE: 'img_wdith' is misspelt, but later cells reference this exact name,
# so it is kept unchanged here.
df = df.rename(
    columns={
        '0': 'img_height',
        '1': 'img_wdith',
        '2': 'aspect_ratio',
    }
)

In [14]:
# One distribution plot per image variable, side by side on a 13x5 figure.
fig, ax = plt.subplots(1, 3, figsize=(13, 5))
for panel, column in zip(ax, ['img_height', 'img_wdith', 'aspect_ratio']):
    sns.distplot(df[column], ax=panel)

- All three variables have positively (right-) skewed distributions

Examining relationships between the variables:

In [15]:
# Pairwise scatter plots of the three image variables.
image_columns = ['img_height', 'img_wdith', 'aspect_ratio']
sns.pairplot(data=df[image_columns])

Looking at these variables specifically in the context of ads vs. non-ads

In [16]:
# Peek at the first five rows, including the raw label column.
df.head(n=5)

In [17]:
# Encode the label column: 'ad.' -> 1, 'nonad.' -> 0.
# Assign by column name and cast explicitly. Writing through .iloc can keep
# the column's object dtype on newer pandas, which leaves the labels as
# Python objects instead of integers. The cast also fails loudly if an
# unexpected label is present.
df['ad'] = df['ad'].replace({'ad.': 1, 'nonad.': 0}).astype(int)

In [18]:
# Share of observations labelled as ads: only about 14%, so the classes are imbalanced.
# The modeling section below handles this imbalance.
df.loc[:, 'ad'].mean()

In [19]:
# Image width for ads (left) and non-ads (right).
# The panels share a y-axis and carry titles so the two groups can be
# told apart and compared directly.
fig, (ax_ad, ax_nonad) = plt.subplots(1, 2, sharey=True)
df.loc[df['ad'] == 1, 'img_wdith'].plot.box(ax=ax_ad, title='ad')
df.loc[df['ad'] == 0, 'img_wdith'].plot.box(ax=ax_nonad, title='non-ad');

# Model Prep

In [20]:
# Features are every column except the last; the last column is the target.
x, y = df.iloc[:, :-1], df.iloc[:, -1]

### Best practice: standardizing the features to zero mean and unit variance (z-scores)

In [21]:
from sklearn.preprocessing import StandardScaler

# Standardise every feature column; `x` becomes a NumPy array from here on.
# NOTE(review): the scaler is fitted on the full data set before the
# train/test split, so test-set statistics leak into training. Consider
# fitting on the training split only.
scaled = StandardScaler()
x = scaled.fit(x).transform(x)

In [22]:
# Pairwise plots of the image variables, coloured by the ad / non-ad label.
sns.pairplot(
    data=df[['img_height', 'img_wdith', 'aspect_ratio', 'ad']],
    hue='ad',
)

# Modeling yay!

In [23]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [24]:
# Hold out a third of the data for testing.
# The split is stratified on the label so that the minority class (ads)
# keeps the same proportion in both the training and the test set.
x_train, x_test, y_train, y_test = train_test_split(
    np.array(x),
    np.array(y),
    test_size=0.33,
    random_state=42,
    stratify=np.array(y),
)

### Balancing the classes by oversampling the minority class (ads) with SMOTE

In [25]:
from imblearn.over_sampling import SMOTE
os= SMOTE(random_state=42)

In [27]:
columns=x_train.columns
os_data_x,os_data_y=os.fit_sample(x_train, y_train)
os_data_X = pd.DataFrame(data=os_data_x,columns=columns )
os_data_y= pd.DataFrame(data=os_data_y,columns=['y'])

In [29]:
# Report the class balance after oversampling.
# The labels are ads (1) and non-ads (0); the earlier "subscription" wording
# did not describe this data set.
n_total = len(os_data_x)
n_nonads = int((os_data_y['y'] == 0).sum())
n_ads = int((os_data_y['y'] == 1).sum())

print("length of oversampled data is ", n_total)
print("Number of non-ads in oversampled data", n_nonads)
print("Number of ads in oversampled data", n_ads)
print("Proportion of non-ads in oversampled data is ", n_nonads / n_total)
print("Proportion of ads in oversampled data is ", n_ads / n_total)

In [30]:
# Number of rows in the oversampled training features.
os_data_x.shape[0]

In [31]:
# Leftover scratch cell: the stray debug print was removed. This cell can be deleted.