# General Linear Model Analysis of Net Gain

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from matplotlib import pyplot
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
ads = pd.read_csv("C:\\Users\\nicol\\TDI\\CodingChallenge\\advertisement_netgain.csv")
ads.head()

Unnamed: 0,id,realtionship_status,industry,genre,targeted_sex,average_runtime(minutes_per_week),airtime,airlocation,ratings,expensive,money_back_guarantee,netgain
0,19717,Married-spouse-absent,Auto,Comedy,Male,45,Primetime,United-States,0.027465,High,No,False
1,31593,Married-civ-spouse,Pharma,Comedy,Male,45,Primetime,United-States,0.027465,Low,No,False
2,5681,Divorced,Entertainment,Comedy,Female,45,Primetime,United-States,0.027465,High,Yes,False
3,15491,Separated,Political,Infomercial,Female,40,Primetime,United-States,0.027465,Low,No,False
4,23587,Married-civ-spouse,Pharma,Comedy,Male,48,Primetime,United-States,0.027465,High,No,True


In [2]:
ads.isnull().sum()

id                                   0
realtionship_status                  0
industry                             0
genre                                0
targeted_sex                         0
average_runtime(minutes_per_week)    0
airtime                              0
airlocation                          0
ratings                              0
expensive                            0
money_back_guarantee                 0
netgain                              0
dtype: int64

In [3]:
labelencoder = LabelEncoder()

In [4]:
#defining a preprocessing function to convert key features to continuous data types
def preprocessing (ads):
    ads.drop(axis = 1, columns = ['id'], inplace = True)
    #correcting misspelling
    ads.rename(columns = {'realtionship_status':'relationship_status'}, inplace = True)
    
    expense_dict = {'Low':0, 'Medium':1, 'High':2}
    ads['expensive'] = ads['expensive'].map(expense_dict, 'ignore')
    
    airtime_dict = {'Primetime':0, 'Morning':1, 'Daytime':2}
    ads['airtime'] = ads['airtime'].map(airtime_dict, 'ignore')
    
    genre_dict = {'Comedy':0, 'Infomercial': 1, 'Drama': 2, 'Direct': 3, 'Other':4}
    ads['genre'] = ads['genre'].map(genre_dict, 'ignore')
    
    industry_dict = {'Auto':0, 'Pharma':1, 'Entertainment':2, 'Political':3, 'ClassAction':4,
                     'Other':5}
    ads['industry'] = ads['industry'].map(industry_dict, 'ignore')
    
    ads['airlocation'] = ads['airlocation'].apply(lambda x: '1' if 'United-States' in x else '0')
    #ds['airlocation'] = ads['airlocation'].astype(int)
    
    #combining categories from 7 to 4
    relationship_dict = {'Married-spouse-absent':'Separated', 'Divorced': 'Separated',
                         'Separated': 'Separated', 'Married-civ-spouse': 'Married',
                         'Married-AF-spouse': 'Married', 'Never-married':'Single',
                         'Widowed':'Widowed'}                     
    ads['relationship_status'] = ads['relationship_status'].map(relationship_dict, 'ignore')
    
    encoding_dict = {'Separated':0, 'Married':1, 'Single':2, 'Widowed':3}
    ads['relationship_status'] = ads['relationship_status'].map(encoding_dict, 'ignore')
    
    #Label Encoding dichotomous variables to int type
    #1:male, 0:female
    ads['targeted_sex']= labelencoder.fit_transform(ads['targeted_sex'])
    #1:yes, 0:no
    ads['money_back_guarantee']= labelencoder.fit_transform(ads['money_back_guarantee'])
    
    ads['ratings'] = ads['ratings'].astype(int)
      
    return ads

new_ads = preprocessing(ads)
new_ads.head()

Unnamed: 0,relationship_status,industry,genre,targeted_sex,average_runtime(minutes_per_week),airtime,airlocation,ratings,expensive,money_back_guarantee,netgain
0,0,0,0,1,45,0,1,0,2,0,False
1,1,1,0,1,45,0,1,0,0,0,False
2,0,2,0,0,45,0,1,0,2,1,False
3,0,3,1,0,40,0,1,0,0,0,False
4,1,1,0,1,48,0,1,0,2,0,True


In [5]:
#normalizing data for ease of comparison
scaler = MinMaxScaler() 
scaled_values = scaler.fit_transform(new_ads) 
new_ads.loc[:,:] = scaled_values
new_ads.head()

Unnamed: 0,relationship_status,industry,genre,targeted_sex,average_runtime(minutes_per_week),airtime,airlocation,ratings,expensive,money_back_guarantee,netgain
0,0.0,0.0,0.0,1.0,0.44898,0.0,1.0,0.0,1.0,0.0,0.0
1,0.333333,0.2,0.0,1.0,0.44898,0.0,1.0,0.0,0.0,0.0,0.0
2,0.0,0.4,0.0,0.0,0.44898,0.0,1.0,0.0,1.0,1.0,0.0
3,0.0,0.6,0.25,0.0,0.397959,0.0,1.0,0.0,0.0,0.0,0.0
4,0.333333,0.2,0.0,1.0,0.479592,0.0,1.0,0.0,1.0,0.0,1.0


In [6]:
new_ads.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26048 entries, 0 to 26047
Data columns (total 11 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   relationship_status                26048 non-null  float64
 1   industry                           26048 non-null  float64
 2   genre                              26048 non-null  float64
 3   targeted_sex                       26048 non-null  float64
 4   average_runtime(minutes_per_week)  26048 non-null  float64
 5   airtime                            26048 non-null  float64
 6   airlocation                        26048 non-null  float64
 7   ratings                            26048 non-null  float64
 8   expensive                          26048 non-null  float64
 9   money_back_guarantee               26048 non-null  float64
 10  netgain                            26048 non-null  float64
dtypes: float64(11)
memory usage: 2.2 MB


Generating train and test data

In [7]:
response = new_ads['netgain']
features = new_ads.iloc[:,0:10]

In [8]:
my_result_list = train_test_split(features, response, test_size=0.20, random_state=0)
features_train, features_test, response_train, response_test = my_result_list

# Comparison of several classification models

In [9]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

In [10]:
classifier = DecisionTreeClassifier(random_state = 0) #defining the classifier
classifier.fit(features_train, response_train)#fitting the decision tree model to our data

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=0, splitter='best')

In [12]:
#using the defined classifier to generate predictions
response_pred = classifier.predict(features_test)

In [13]:
models_list = [RandomForestClassifier(),
               DecisionTreeClassifier(),
               XGBClassifier()]
model_names = ['Random Forest', 
               'Decision Tree',
               'Extreme Gradient Boosting']

accuracy_list = []
results_dict = {}

for model in range(len(models_list)):
    classifier = models_list[model]
    classifier.fit(features_train, response_train)
    new_response_pred = classifier.predict(features_test)
    accuracy_list.append(accuracy_score(response_pred,response_test))
    
results_dict = {'Model Name': model_names, 'Accuracy': accuracy_list}
results_df = pd.DataFrame(results_dict)
results_df.head()

Unnamed: 0,Model Name,Accuracy
0,Random Forest,0.776008
1,Decision Tree,0.776008
2,Extreme Gradient Boosting,0.776008


Initial models produce average accuracy at approximately 78%. My project will further analyze feature importance and determine what advertising characteristics best predict net gain. 