# 2.0 Feature Selection, Engineering and Analysis

In [1]:
import pandas as pd
import numpy as np
import json
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import roc_curve, auc, roc_auc_score

In [2]:
# import cleaned data and make it into a pandas Data Frame
with open('json_/cleaned_data_1.json') as json_file:
    data = json.load(json_file)

In [3]:
df = pd.read_json(data)

#### Ensure Data Was Imported Correctly

In [4]:
df.shape

(9121, 25)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9121 entries, 0 to 999
Data columns (total 25 columns):
backers_count               9121 non-null int64
converted_pledged_amount    9121 non-null int64
country                     9121 non-null object
created_at                  9121 non-null datetime64[ns]
currency                    9121 non-null object
current_currency            9121 non-null object
deadline                    9121 non-null int64
disable_communication       9121 non-null bool
fx_rate                     9121 non-null float64
goal                        9121 non-null float64
id                          9121 non-null int64
is_starrable                9121 non-null bool
launched_at                 9121 non-null datetime64[ns]
name                        9121 non-null object
pledged                     9121 non-null float64
spotlight                   9121 non-null bool
staff_pick                  9121 non-null bool
state                       9121 non-null object
state

In [6]:
df.head(1)

Unnamed: 0,backers_count,converted_pledged_amount,country,created_at,currency,current_currency,deadline,disable_communication,fx_rate,goal,...,spotlight,staff_pick,state,state_changed_at,static_usd_rate,urls,usd_pledged,cat_name,cat_slug,loc_state
0,6,1847,GB,2015-04-03,GBP,USD,1430956800000,False,1.30399,7000.0,...,False,False,canceled,2015-04-19,1.491538,"{""web"":{""project"":""https://www.kickstarter.com...",1842.049134,Ready-to-wear,fashion/ready-to-wear,England


In [7]:
# deadline's date appears to have changed format, as such it is returned to desired format
df['deadline'] = pd.to_datetime(df['deadline'], unit='ms')
df['deadline'] = pd.to_datetime(df['deadline'].dt.date)

In [8]:
# Get names of indexes with states of canceled, live and suspended and remove them as they are
# not an indication of success or failure. Cancellation and suspension could be for various reasons other
# than failure.
indexNames = df[(df['state'] == 'canceled')|(df['state'] == 'live')|(df['state'] == 'suspended')].index
# Delete these row indexes from dataFrame
df.drop(indexNames,0,inplace=True)

In [9]:
# Check data target balance
df['state'].value_counts()

successful    5174
failed        3171
Name: state, dtype: int64

In [10]:
# I would like to convert countries into continents 
df['country'].value_counts();

In [11]:
for index, row in df.iterrows():
    if df.loc[index,'country'] == 'US' or df.loc[index,'country'] == 'CA':
        df.loc[index,'country'] = 'NAmerica'
    elif df.loc[index,'country'] == 'NZ' or df.loc[index,'country'] == 'AU':
        df.loc[index,'country'] = 'Aus'
    elif df.loc[index,'country'] == 'JP' or df.loc[index,'country'] == 'CH' or df.loc[index,'country'] == 'HK' or df.loc[index,'country'] == 'SG':
        df.loc[index,'country'] = 'Aisa'
    elif df.loc[index,'country'] == 'MX':
        df.loc[index,'country'] = 'SAmerica'
    else:
        df.loc[index,'country'] = 'Euro'

In [12]:
df.reset_index(drop=True, inplace=True)

#### Create new features

In [13]:
# time_allowed feature is the time between creating the kickstarter project and ending it in days
df['time_allowed'] = df['state_changed_at']-df['created_at']

In [14]:
# which is then converted into an integer
df.time_allowed = df.time_allowed.dt.days

In [15]:
# ratio of funding pledged/goal is used to normalise the date and replace 'goal' and 'pledge' amount
df['pledge/goal'] = (df['converted_pledged_amount']/df['goal'])

In [16]:
# another ration of goal and time_allowed is attempted
df['goal/time_allowed'] = (df['goal']/df['time_allowed'])

In [17]:
# Below I am inspecting the data to gauge presence of outliers and understand what is the range I am looking at

In [18]:
df.goal.describe();

In [19]:
df.time_allowed.describe();

In [20]:
df['pledge/goal'].describe();

In [21]:
df['pledge/goal'].value_counts();

In [26]:
outliers_index = list((df[df['pledge/goal'] > 4.5].index)|((df[df['time_allowed'] >= 2000].index)))

In [27]:
len(outliers_index)

41

In [28]:
df.drop(outliers_index,0,inplace=True)

In [29]:
df.reset_index(drop=True, inplace=True)

In [30]:
df.shape

(8304, 28)

In [31]:
df['pledge/goal'].describe()

count     8304.000000
mean         3.960467
std        142.233836
min          0.000000
25%          0.039493
50%          1.028000
75%          1.375000
max      12575.000000
Name: pledge/goal, dtype: float64

In [35]:
plt.hist(df['pledge/goal'], bins=10)


In [36]:
# binning pledge/goal ration into bins
bins = [-0.01, 0.5, 1.0 , 1.5, 2.0, 2.5, 3.0, 5.0]
bins_pledgeGoal = pd.cut(df['pledge/goal'], bins)
bins_pledgeGoal = bins_pledgeGoal.cat.as_ordered()
df["pledge/goal"]=bins_pledgeGoal
df["pledge/goal"];

In [37]:
df.isna().sum()

In [None]:
# Pick contineous features

In [None]:
X_c = df[['time_allowed','goal', 'backers_count']]

In [None]:
# X_c.to_csv(r'csv_\X_c.csv')

In [None]:
# Pick categotical features

In [None]:
X_d = pd.get_dummies(df[['cat_slug', 'staff_pick', 'pledge/goal']], drop_first=True, prefix_sep='_')

In [None]:
X_d.columns

In [None]:
# rename 'pledge/goal_...' as they pose issues when running models

In [None]:
X_d.rename(columns={'pledge/goal_(0.5, 1.0]': 'p/g_low','pledge/goal_(1.0, 1.5]': 'p/g_high', 'pledge/goal_(1.5, 2.0]': 'p/g_vhigh',  'pledge/goal_(2.0, 2.5]': 'p/g_shigh', 'pledge/goal_(2.0, 2.5]':'p/g_sdhigh', 'pledge/goal_(2.5, 3.0]': 'p/g_uhigh', 'pledge/goal_(3.0, 5.0]': 'p/g_extraOrdinarlyHigh' }, inplace=True)

In [None]:
# X_d.to_csv(r'csv_\X_d.csv')

In [None]:
# Join X_c and X_d Features
X = pd.concat([X_c,X_d],1)
X;

In [None]:
# Set target to binary
y = pd.get_dummies(df[['state']], drop_first=True)
y;

In [None]:
# y.to_csv(r'csv_\y.csv')

In [None]:
X.isna().sum().sum()

In [None]:
# Plot pair relationships
sns.pairplot(pd.concat([X_c,y],1), diag_kind="kde", height=2.5)

In [None]:
(pd.concat([X_c,y],1)).corr()

# 3.0 Modelling

The following models will be used:
   - Logistic regression
   - Random forest
   - XGBoost

### 3.1 Split, Train, Test

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, shuffle=True)

### 3.2 Random Forest

In [None]:
forest_vanilla = RandomForestClassifier()
forest_vanilla.fit(X_train, y_train)

In [None]:
y_hat_train_v_rf = forest_vanilla.predict(X_train)
y_hat_test_v_rf = forest_vanilla.predict(X_test)
accuracy_score(y_train, y_hat_train_v_rf), accuracy_score(y_test, y_hat_test_v_rf)

In [None]:
print(confusion_matrix(y_train, y_hat_train_v_rf))

In [None]:
print(confusion_matrix(y_test, y_hat_test_v_rf))

In [None]:
# hyperparameter tuning 

In [None]:
# varying number of max_depth parameter and setting criterion='entropy'
array32 = list(range(1,50))
clf_list = []
for i in array32:
    clf_choc = RandomForestClassifier(criterion='entropy', max_depth=i);  # Train the classifier using training data 
    clf_list.append(clf_choc.fit(X_train, y_train))
auc_list_train = []
for i in clf_list:
    y_pred = i.predict(X_train)
    false_positive_rate, true_positive_rate, thresholds = roc_curve(y_train, y_pred)
    auc_list_train.append(auc(false_positive_rate, true_positive_rate))
auc_list_test = []
for i in clf_list:
    y_pred1 = i.predict(X_test)
    false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred1)
    auc_list_test.append(auc(false_positive_rate, true_positive_rate))

In [None]:
plt.figure(figsize=(12,6))
plt.plot(array32, auc_list_test, 'b', label='Test AUC')
plt.plot(array32, auc_list_train, 'r', label='Train AUC')
plt.ylabel('AUC score')
plt.xlabel('Tree depth')
plt.legend()
plt.show()

In [None]:
# varying number of min_samples_split parameter and setting criterion='entropy'
array1 = list(map(lambda x: round(x,1), np.arange(0.1,1.0,0.1)))
clf_list_strawberry = []
for i in array1:
    clf_strawberry = RandomForestClassifier(criterion='entropy', min_samples_split = i)  # Train the classifier using training data 
    clf_list_strawberry.append(clf_strawberry.fit(X_train, y_train))
auc_list_test_strawberry = []
for i in clf_list_strawberry:
    y_pred = i.predict(X_test)
    false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred)
    auc_list_test_strawberry.append(auc(false_positive_rate, true_positive_rate))
auc_list_train_strawberry = []
for i in clf_list_strawberry:
    y_pred1 = i.predict(X_train)
    false_positive_rate, true_positive_rate, thresholds = roc_curve(y_train, y_pred1)
    auc_list_train_strawberry.append(auc(false_positive_rate, true_positive_rate))

In [None]:
plt.figure(figsize=(12,6))
plt.plot(array1, auc_list_test_strawberry, 'b', label='Test AUC')
plt.plot(array1, auc_list_train_strawberry, 'r', label='Train AUC')
plt.ylabel('AUC score')
plt.xlabel('Tree min_samples_split')
plt.legend()
plt.show()

In [None]:
# varying number of min_samples_leaf parameter and setting criterion='entropy'
array05 = list(map(lambda x: round(x,1), np.arange(0.1,0.5,0.1)))
clf_list_cookiencream = []
for i in array05:
    clf_cookiencream = RandomForestClassifier(criterion='entropy', min_samples_leaf = i)  # Train the classifier using training data 
    clf_list_cookiencream.append(clf_cookiencream.fit(X_train, y_train))
auc_list_test_cookiencream = []
for i in clf_list_cookiencream:
    y_pred = i.predict(X_test)
    false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred)
    auc_list_test_cookiencream.append(auc(false_positive_rate, true_positive_rate))
auc_list_train_cookiencream = []
for i in clf_list_cookiencream:
    y_pred1 = i.predict(X_train)
    false_positive_rate, true_positive_rate, thresholds = roc_curve(y_train, y_pred1)
    auc_list_train_cookiencream.append(auc(false_positive_rate, true_positive_rate))

In [None]:
plt.figure(figsize=(12,6))
plt.plot(array05, auc_list_test_cookiencream, 'b', label='Test AUC')
plt.plot(array05, auc_list_train_cookiencream, 'r', label='Train AUC')
plt.ylabel('AUC score')
plt.xlabel('Tree min_samples_leaf')
plt.legend()
plt.show()

In [None]:
# varying number of max_features parameter and setting criterion='entropy'
array200 = range(1,len(X.columns))
clf_list_peanutbutter = []
for i in array200:
    clf_peanutbutter = RandomForestClassifier(criterion='entropy', max_features = i)  # Train the classifier using training data 
    clf_list_peanutbutter.append(clf_peanutbutter.fit(X_train, y_train))
auc_list_test_peanutbutter = []
for i in clf_list_peanutbutter:
    y_pred = i.predict(X_test)
    false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred)
    auc_list_test_peanutbutter.append(auc(false_positive_rate, true_positive_rate))
auc_list_train_peanutbutter = []
for i in clf_list_peanutbutter:
    y_pred1 = i.predict(X_train)
    false_positive_rate, true_positive_rate, thresholds = roc_curve(y_train, y_pred1)
    auc_list_train_peanutbutter.append(auc(false_positive_rate, true_positive_rate))

In [None]:
plt.figure(figsize=(12,6))
plt.plot(array200, auc_list_test_peanutbutter, 'b', label='Test AUC')
plt.plot(array200, auc_list_train_peanutbutter, 'r', label='Train AUC')
plt.ylabel('AUC score')
plt.xlabel('Tree max_features')
plt.legend()
plt.show()

In [None]:

clf_rf = RandomForestClassifier(criterion='entropy', max_depth=12, min_samples_split =0.2, max_features = 22)
clf_rf.fit(X_train, y_train)
y_hat_rf_train = clf_rf.predict(X_train)
y_hat_rf_test = clf_rf.predict(X_test)
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_hat_rf_test)
auc(false_positive_rate, true_positive_rate)

In [None]:
accuracy_score(y_train, y_hat_rf_train), accuracy_score(y_test, y_hat_rf_test)

In [None]:
# test data achieves same accuracy as Vanilla
print(confusion_matrix(y_test, y_hat_test_v_rf))

### 3.3 Logistic Regression

In [None]:
logreg_vanilla = LogisticRegression()
model_log = logreg_vanilla.fit(X_train, y_train)

In [None]:
y_score_lr_v = logreg_vanilla.decision_function(X_test)
y_hat_test_lr_v = logreg_vanilla.predict(X_test)
y_hat_train_lr_v = logreg_vanilla.predict(X_train)
fpr, tpr, thresholds = roc_curve(y_test, y_score_lr_v)

In [None]:
print('AUC: {}'.format(auc(fpr, tpr)))
print(confusion_matrix(y_train, y_hat_train_lr_v))
print(confusion_matrix(y_test, y_hat_test_lr_v))

In [None]:
# accuracy enhanced slightly when compared to RandomForest
accuracy_score(y_test, y_hat_test_lr_v)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

#Seaborns Beautiful Styling
sns.set_style("darkgrid", {"axes.facecolor": ".9"})

print('AUC: {}'.format(auc(fpr, tpr)))
plt.figure(figsize=(10,8))
lw = 2
plt.plot(fpr, tpr, color='darkorange',
         lw=lw, label='ROC curve')
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.yticks([i/20.0 for i in range(21)])
plt.xticks([i/20.0 for i in range(21)])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.show()

In [None]:
# hyperparameter tuing

In [None]:
# Change C parameter
C_param_range = [0.001,0.01,0.1,1,10,100]
clf_list = []
for i in C_param_range:
    clf_choc = LogisticRegression(C=i);  # Train the classifier using training data 
    clf_list.append(clf_choc.fit(X_train, y_train))
auc_list_train_lr_choc = []
for i in clf_list:
    y_pred = i.predict(X_train)
    false_positive_rate, true_positive_rate, thresholds = roc_curve(y_train, y_pred)
    auc_list_train_lr_choc.append(auc(false_positive_rate, true_positive_rate))
auc_list_test_lr_choc = []
for i in clf_list:
    y_pred1 = i.predict(X_test)
    false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred1)
    auc_list_test_lr_choc.append(auc(false_positive_rate, true_positive_rate))

In [None]:
plt.figure(figsize=(12,6))
plt.plot(C_param_range, auc_list_test_lr_choc, 'b', label='Test AUC')
plt.plot(C_param_range, auc_list_train_lr_choc, 'r', label='Train AUC')
plt.ylabel('AUC score')
plt.xlabel('C Parameter')
plt.legend()
plt.show()

In [None]:
# Change penlty and corresponding solver type
penalty_list = {'l1':'liblinear', 'l2':'liblinear', 'none':'lbfgs'}
clf_list_strawberry = []
for k,v in penalty_list.items():
    print(k,v)
    clf_strawberry = LogisticRegression(penalty = k,solver=v)  # Train the classifier using training data 
    clf_list_strawberry.append(clf_strawberry.fit(X_train, y_train))
auc_list_test_strawberry_lr = []
for i in clf_list_strawberry:
    y_pred = i.predict(X_test)
    false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred)
    auc_list_test_strawberry_lr.append(auc(false_positive_rate, true_positive_rate))
auc_list_train_strawberry_lr = []
for i in clf_list_strawberry:
    y_pred1 = i.predict(X_train)
    false_positive_rate, true_positive_rate, thresholds = roc_curve(y_train, y_pred1)
    auc_list_train_strawberry_lr.append(auc(false_positive_rate, true_positive_rate))

In [None]:
auc_list_test_strawberry_lr, auc_list_train_strawberry_lr

In [None]:
# set width of bar
plt.figure(figsize=(12,6))
barWidth = 0.25
 
# set height of bar
bars1 = auc_list_test_strawberry_lr
bars2 = auc_list_train_strawberry_lr
 
# Set position of bar on X axis
r1 = np.arange(len(bars1))
r2 = [x + barWidth for x in r1]
 
# Make the plot
plt.bar(r1, bars1, width=barWidth, edgecolor='white', label='var1')
plt.bar(r2, bars2, color='#2d7f5e', width=barWidth, edgecolor='white', label='var2')
 
# Add xticks on the middle of the group bars
plt.xlabel('group', fontweight='bold')
plt.xticks([r + barWidth for r in range(len(bars1))], list(penalty_list.keys()))
 
# Create legend & Show graphic
plt.legend()
plt.show()

In [None]:
# Little difference - keep as Vanilla

### 3.4 XGBoost

In [None]:
XGBoost = xgb.XGBClassifier()
XGBoost.fit(X_train, y_train)
y_hat_train = XGBoost.predict(X_train)
y_hat_test = XGBoost.predict(X_test)
training_accuracy = accuracy_score(y_train, y_hat_train)
val_accuracy = accuracy_score(y_test, y_hat_test)

print("")
print("Training Accuracy: {:.4}%".format(training_accuracy * 100))
print("Validation accuracy: {:.4}%".format(val_accuracy * 100))

In [None]:
param_grid = {
    "learning_rate": [0.1],
    'max_depth': [6],
    'min_child_weight': [10],
    'subsample': [ 0.7],
    'n_estimators': [5, 30, 100, 250],
}

In [None]:
grid_clf = GridSearchCV(XGBoost, param_grid, scoring='accuracy', cv=None, n_jobs=1)
grid_clf.fit(X, y)

best_parameters = grid_clf.best_params_

print("Grid Search found the following optimal parameters: ")
for param_name in sorted(best_parameters.keys()):
    print("%s: %r" % (param_name, best_parameters[param_name]))

y_hat_train = grid_clf.predict(X_train)
y_hat_test = grid_clf.predict(X_test)
training_accuracy = accuracy_score(y_train, y_hat_train)
val_accuracy = accuracy_score(y_test, y_hat_test)

print("")
print("Training Accuracy: {:.4}%".format(training_accuracy * 100))
print("Validation accuracy: {:.4}%".format(val_accuracy * 100))

In [None]:
# Grid Search found the following optimal parameters: 
# learning_rate: 0.1
# max_depth: 6
# min_child_weight: 10
# n_estimators: 30
# subsample: 0.7

# Training Accuracy: 96.55%
# Validation accuracy: 96.12%

# Conclusion

All work very well and pretty much predict with the same accuracy. Next I will try PCA for feature selection.

In [None]:
# inspect and explain
# label figures 