In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import re

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import BaggingClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb

In [None]:
# Load the training CSV and preview its first rows.
df = pd.read_csv('/content/drive/MyDrive/TSW-hackathon-1/train.csv')
df.head()

Unnamed: 0,customer_id,destination,passanger,weather,temperature,time,coupon,expiration,gender,age,maritalStatus,has_children,education,occupation,income,car,Bar,CoffeeHouse,CarryAway,RestaurantLessThan20,Restaurant20To50,toCoupon_GEQ5min,toCoupon_GEQ15min,toCoupon_GEQ25min,direction_same,direction_opp,Y
0,258868,No Urgent Place,Friend(s),Sunny,80,6PM,Restaurant(<20),1d,Male,21,Unmarried partner,0,Some college - no degree,Student,Less than $12500,,less1,1~3,4~8,4~8,never,1,1,0,0,1,1
1,318369,Work,Alone,Sunny,80,7AM,Restaurant(<20),2h,Male,21,Single,0,Bachelors degree,Student,$37500 - $49999,,less1,1~3,4~8,1~3,less1,1,0,0,1,0,0
2,320906,No Urgent Place,Alone,Sunny,80,10AM,Coffee House,2h,Female,21,Married partner,0,Some college - no degree,Student,Less than $12500,,less1,gt8,4~8,1~3,1~3,1,1,0,0,1,0
3,412393,Work,Alone,Rainy,55,7AM,Restaurant(<20),2h,Female,26,Single,0,Bachelors degree,Computer & Mathematical,$25000 - $37499,,less1,less1,4~8,1~3,never,1,1,1,0,1,0
4,290854,Home,Alone,Snowy,30,6PM,Coffee House,1d,Male,31,Single,0,Some college - no degree,Unemployed,$12500 - $24999,,1~3,less1,4~8,less1,never,1,1,0,0,1,0


In [None]:
# Columns to remove from the training frame; the feature-engineering cells
# below append the raw columns they replace to this list.
drop_columns = ['customer_id','direction_opp']
# Work on a copy so the loaded frame `df` is left unmodified.
dfModified = df.copy()

In [None]:
# gender -> 1 for "Male", 0 for any other value (overwrites the column in place)
dfModified['gender'] = dfModified['gender'].eq("Male").astype(int)

# weather -> 1 for 'Sunny', 0 for every other weather value
dfModified['is_sunny'] = dfModified['weather'].eq('Sunny').astype(int)

# expiration -> 1 when the value is '1d', 0 otherwise
dfModified['expiration_oneDay'] = dfModified['expiration'].eq('1d').astype(int)

# The raw columns are superseded by the flags above, so queue them for removal
drop_columns.extend(['weather', 'expiration'])

In [None]:
# Temperature label: 30 -> 1, 55 -> 2, any other value -> 3
dfModified['temperature_index'] = (
    dfModified['temperature'].map({30: 1, 55: 2}).fillna(3).astype('int64')
)
drop_columns.append('temperature')

# Time label: '10AM' -> 1, '2PM' -> 2, '6PM' -> 3, '10PM' -> 4, any other value -> 0
dfModified['time_index'] = (
    dfModified['time'].map({'10AM': 1, '2PM': 2, '6PM': 3, '10PM': 4}).fillna(0).astype('int64')
)
drop_columns.append('time')

In [None]:
def is_often(val):
  """Return True when `val` is 'gt8' or '4~8', the two labels this notebook counts as "often"."""
  return val in ('gt8', '4~8')

# For every visit-frequency column, add a 0/1 `<column>_isoften` flag computed
# with is_often() and queue the raw column for removal.
for freq_col in ['Bar', 'CoffeeHouse', 'CarryAway', 'RestaurantLessThan20', 'Restaurant20To50']:
  dfModified[f'{freq_col}_isoften'] = dfModified[freq_col].apply(is_often).astype(int)
  drop_columns.append(freq_col)

In [None]:
def income_mean(val):
  """Map an income-bracket label to one representative dollar amount.

  Parameters
  ----------
  val : str
    Bracket label, e.g. '$37500 - $49999' or 'Less than $12500'.

  Returns
  -------
  float
    * label with exactly two numbers -> their mean (midpoint of the range)
    * label containing 'Less than'   -> 6250.0
    * any other string               -> 150000.0
    * non-string input (e.g. NaN/None for a missing value) -> NaN, rather
      than the TypeError that re.findall raises on non-text input
  """
  # Missing values arrive as float NaN / None; keep them missing.
  if not isinstance(val, str):
    return np.nan

  # Raw string: '\d' inside a plain literal is an invalid escape sequence.
  bounds = [int(dig) for dig in re.findall(r'\d+', val)]
  if len(bounds) == 2:
    return np.mean(bounds)
  if 'Less than' in val:
    return 6250.0
  return 150000.0


# Replace the income bracket label with the numeric value from income_mean()
# and queue the raw column for removal.
dfModified['income_mean'] = dfModified['income'].apply(income_mean)
drop_columns.append('income')

In [None]:
# Occupations that are collapsed into the single category "other".
groupOccupation_other = [
    'Farming Fishing & Forestry',
    'Building & Grounds Cleaning & Maintenance',
    'Production Occupations',
    'Installation Maintenance & Repair',
    'Construction & Extraction',
    'Protective Service',
    'Personal Care & Service',
    'Architecture & Engineering',
    'Life Physical Social Science',
    'Transportation & Material Moving',
    'Legal',
    'Healthcare Support',
    'Community & Social Services',
    'Healthcare Practitioners & Technical',
    'Food Preparation & Serving Related',
]

# Rows whose occupation is in the list get "other"; all other rows are unchanged.
dfModified['occupation'] = dfModified['occupation'].mask(
    dfModified['occupation'].isin(groupOccupation_other), "other"
)

In [None]:
# Remove every queued column and print the shape before and after.
# set() de-duplicates the list, which grows each time an appending cell is re-run.
# NOTE(review): the drop is in place, so re-running this cell raises KeyError
# because the columns are already gone.
print(dfModified.shape)
dfModified.drop(set(drop_columns),axis=1, inplace=True)
print(dfModified.shape)

(10147, 37)
(10147, 25)


In [None]:
# Target vector
TargetColumns = dfModified['Y']

# Feature matrix: one-hot encode everything except the target, then strip all
# non-alphanumeric characters from the resulting column names.
featuresColumns = (
    pd.get_dummies(dfModified.drop(columns=['Y']))
    .rename(columns=lambda name: re.sub('[^a-zA-Z0-9]', '', name))
)

In [None]:
# Feature columns removed from the training matrix before modelling.
# Names use the sanitised (alphanumeric-only) form produced in the previous cell.
# NOTE(review): presumably "red" means redundant; the selection criterion is not shown here.
drop_red_columns = ['destinationHome','destinationWork', 'educationAssociatesdegree','educationHighSchoolGraduate','educationSomeHighSchool', 'RestaurantLessThan20isoften',
                    'age41','age46', 'age50plus','CarryAwayisoften','haschildren',
                    'agebelow21', 'toCouponGEQ5min']
featuresColumns.drop(drop_red_columns, axis=1, inplace=True)

In [None]:
# Hold out 20% of the rows for evaluation; random_state fixes the split.
xtrain, xtest, ytrain, ytest = train_test_split(featuresColumns, TargetColumns, test_size=0.2, random_state=20)

In [None]:
# Baseline: a single seeded decision tree, scored on the hold-out split.
tree = DecisionTreeClassifier(random_state=20).fit(xtrain, ytrain)
ypred = tree.predict(xtest)
print(f"Accuracy is {accuracy_score(ytest, ypred)*100}")

Accuracy is 62.906403940886705


In [None]:
# Pair each feature name with the fitted tree's importance for it,
# order from most to least important, and save the table as CSV.
DtreeImportantFeatures = pd.DataFrame({
    'Column': featuresColumns.columns,
    'importance': tree.feature_importances_,
})
DtreeImportantFeatures = DtreeImportantFeatures.sort_values(by="importance", ascending=False)
DtreeImportantFeatures.to_csv('./DtreeImportantFeatures.csv', index=False)

In [None]:
# Grid-search the number of trees for an entropy-based random forest.
# The candidates start at 50: a forest needs at least one tree, so the previous
# range(0, 500, 50) included n_estimators=0, which fails in every fold.
# The forest is seeded so the search is reproducible across runs.
# refit=True retrains the best configuration on all the data passed to fit(),
# which is what grid2.predict() uses afterwards.
grid2 = GridSearchCV(RandomForestClassifier(random_state=20),
                     {'n_estimators': list(range(50, 501, 50)), 'criterion':["entropy"], 'max_depth':[None]},
                     refit = True, verbose = 3, n_jobs=-1)

grid2.fit(featuresColumns, TargetColumns)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:   35.7s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  1.9min finished


GridSearchCV(cv=None, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              ra

In [None]:
# Three different learners whose class probabilities are averaged.
model_1 = RandomForestClassifier(n_estimators=400, criterion="entropy")
model_2 = LogisticRegression()
model_3 = xgb.XGBClassifier()

# Fit each learner on the training split
model_1.fit(xtrain, ytrain)
model_2.fit(xtrain, ytrain)
model_3.fit(xtrain, ytrain)

# Per-class probabilities on the hold-out split, one array per learner
predprob1, predprob2, predprob3 = (
    model.predict_proba(xtest) for model in (model_1, model_2, model_3)
)

# Unweighted mean of the three probability arrays
predProbability = (predprob1 + predprob2 + predprob3) / 3.0

# Predict class 0 only when its averaged probability is strictly larger; ties go to class 1
pred = [int(not p0 > p1) for p0, p1 in predProbability]

In [None]:
# Grid-search XGBoost over the number of boosting rounds, learning rate and booster type.
# The key must be 'n_estimators' (plural): the previous 'n_estimator' is not the
# parameter that sets the number of rounds, so all 23 values trained the same
# default-sized model and only multiplied the search time.
boosterGrid = GridSearchCV(xgb.XGBClassifier(),
                           {'n_estimators':list(range(50,500, 20)), 'learning_rate':[0.1, 0.001, 1, 1e-15], 'booster':['gbtree','gblinear','dart']},
                           refit = True, verbose = 3, n_jobs=-1)
boosterGrid.fit(featuresColumns, TargetColumns)
print(boosterGrid.best_score_)

In [None]:
# Bagging ensemble of depth-limited random forests, scored on the hold-out split.
# The wrapped estimator is passed positionally because its keyword was renamed
# from `base_estimator` to `estimator` in newer scikit-learn releases; the first
# positional argument is the wrapped estimator under either name.
modelBagging = BaggingClassifier(RandomForestClassifier(max_depth=10), n_estimators=1000)

modelBagging.fit(xtrain, ytrain)
ypred = modelBagging.predict(xtest)
print(f"Accuracy is {accuracy_score(ytest, ypred)*100}")

In [None]:
# Load the test CSV (the preview below shows no target column `Y`) and preview its first rows.
testDF = pd.read_csv('/content/drive/MyDrive/TSW-hackathon-1/test.csv')
testDF.head()

Unnamed: 0,customer_id,destination,passanger,weather,temperature,time,coupon,expiration,gender,age,maritalStatus,has_children,education,occupation,income,car,Bar,CoffeeHouse,CarryAway,RestaurantLessThan20,Restaurant20To50,toCoupon_GEQ5min,toCoupon_GEQ15min,toCoupon_GEQ25min,direction_same,direction_opp
0,374679,No Urgent Place,Friend(s),Sunny,80,6PM,Coffee House,1d,Female,below21,Single,0,Some college - no degree,Healthcare Support,Less than $12500,,never,1~3,less1,4~8,less1,1,0,0,0,1
1,469678,Home,Alone,Sunny,80,6PM,Carry out & Take away,2h,Male,21,Unmarried partner,0,High School Graduate,Food Preparation & Serving Related,$12500 - $24999,,1~3,never,gt8,4~8,1~3,1,1,0,1,0
2,216140,No Urgent Place,Alone,Rainy,55,10AM,Coffee House,1d,Female,26,Married partner,1,Some college - no degree,Unemployed,$25000 - $37499,,never,never,1~3,less1,never,1,1,0,0,1
3,184301,No Urgent Place,Partner,Sunny,80,6PM,Bar,1d,Male,50plus,Unmarried partner,1,Associates degree,Retired,$50000 - $62499,,never,4~8,4~8,1~3,less1,1,1,0,0,1
4,148720,Work,Alone,Sunny,30,7AM,Carry out & Take away,1d,Female,26,Married partner,0,Some college - no degree,Office & Administrative Support,$75000 - $87499,,never,never,1~3,4~8,less1,1,1,0,0,1


In [None]:
# Start a fresh removal list for the test frame (rebinds the name used for the
# training frame); the cells below append the raw columns they replace.
drop_columns = ['customer_id','direction_opp']
# Work on a copy so `testDF` keeps `customer_id` for the submission file.
dfModified_test = testDF.copy()

In [None]:
# gender -> 1 for "Male", 0 for any other value (overwrites the column in place)
dfModified_test['gender'] = dfModified_test['gender'].eq("Male").astype(int)

# weather -> 1 for 'Sunny', 0 for every other weather value
dfModified_test['is_sunny'] = dfModified_test['weather'].eq('Sunny').astype(int)
drop_columns.append('weather')

# expiration -> 1 when the value is '1d', 0 otherwise
dfModified_test['expiration_oneDay'] = dfModified_test['expiration'].eq('1d').astype(int)
drop_columns.append('expiration')

# Temperature label: 30 -> 1, 55 -> 2, any other value -> 3
dfModified_test['temperature_index'] = (
    dfModified_test['temperature'].map({30: 1, 55: 2}).fillna(3).astype('int64')
)
drop_columns.append('temperature')

# Time label: '10AM' -> 1, '2PM' -> 2, '6PM' -> 3, '10PM' -> 4, any other value -> 0
dfModified_test['time_index'] = (
    dfModified_test['time'].map({'10AM': 1, '2PM': 2, '6PM': 3, '10PM': 4}).fillna(0).astype('int64')
)
drop_columns.append('time')

In [None]:
def is_often(val):
  """Return True when `val` is 'gt8' or '4~8', the two labels this notebook counts as "often"."""
  return val in ('gt8', '4~8')

# For every visit-frequency column, add a 0/1 `<column>_isoften` flag computed
# with is_often() and queue the raw column for removal.
for freq_col in ['Bar', 'CoffeeHouse', 'CarryAway', 'RestaurantLessThan20', 'Restaurant20To50']:
  dfModified_test[f'{freq_col}_isoften'] = dfModified_test[freq_col].apply(is_often).astype(int)
  drop_columns.append(freq_col)

In [None]:
def income_mean(val):
  """Map an income-bracket label to one representative dollar amount.

  Parameters
  ----------
  val : str
    Bracket label, e.g. '$37500 - $49999' or 'Less than $12500'.

  Returns
  -------
  float
    * label with exactly two numbers -> their mean (midpoint of the range)
    * label containing 'Less than'   -> 6250.0
    * any other string               -> 150000.0
    * non-string input (e.g. NaN/None for a missing value) -> NaN, rather
      than the TypeError that re.findall raises on non-text input
  """
  # Missing values arrive as float NaN / None; keep them missing.
  if not isinstance(val, str):
    return np.nan

  # Raw string: '\d' inside a plain literal is an invalid escape sequence.
  bounds = [int(dig) for dig in re.findall(r'\d+', val)]
  if len(bounds) == 2:
    return np.mean(bounds)
  if 'Less than' in val:
    return 6250.0
  return 150000.0


# Replace the income bracket label with the numeric value from income_mean()
# and queue the raw column for removal.
dfModified_test['income_mean'] = dfModified_test['income'].apply(income_mean)
drop_columns.append('income')

In [None]:
# Occupations that are collapsed into the single category "other".
groupOccupation_other = [
    'Farming Fishing & Forestry',
    'Building & Grounds Cleaning & Maintenance',
    'Production Occupations',
    'Installation Maintenance & Repair',
    'Construction & Extraction',
    'Protective Service',
    'Personal Care & Service',
    'Architecture & Engineering',
    'Life Physical Social Science',
    'Transportation & Material Moving',
    'Legal',
    'Healthcare Support',
    'Community & Social Services',
    'Healthcare Practitioners & Technical',
    'Food Preparation & Serving Related',
]

# Rows whose occupation is in the list get "other"; all other rows are unchanged.
dfModified_test['occupation'] = dfModified_test['occupation'].mask(
    dfModified_test['occupation'].isin(groupOccupation_other), "other"
)

In [None]:
# Remove every queued column from the test frame and print the shape before and after.
# set() de-duplicates the list, which grows each time an appending cell is re-run.
# NOTE(review): the drop is in place, so re-running this cell raises KeyError
# because the columns are already gone.
print(dfModified_test.shape)
dfModified_test.drop(set(drop_columns),axis=1, inplace=True)
print(dfModified_test.shape)

(2537, 36)
(2537, 24)


In [None]:
# One-hot encode the test frame, then strip all non-alphanumeric characters
# from the resulting column names (same sanitising as the training features).
testFeaturesColumns = pd.get_dummies(dfModified_test).rename(
    columns=lambda name: re.sub('[^a-zA-Z0-9]', '', name)
)

In [None]:
# Remove the same columns that were removed from the training features.
# errors='ignore': a dummy column exists only when its category occurs in this
# file, so some of these names may be absent from the test dummies and a strict
# drop would raise KeyError.
drop_red_columns = ['destinationHome','destinationWork', 'educationAssociatesdegree','educationHighSchoolGraduate','educationSomeHighSchool', 'RestaurantLessThan20isoften',
                    'age41','age46', 'age50plus','CarryAwayisoften','haschildren',
                    'agebelow21', 'toCouponGEQ5min']
testFeaturesColumns = testFeaturesColumns.drop(columns=drop_red_columns, errors='ignore')

# get_dummies ran separately on train and test, so the two column sets can
# differ in membership and order. Align to the matrix the models were fitted on:
# dummies missing from the test file become all-zero columns, dummies unseen in
# training are discarded, and the column order matches `featuresColumns`.
testFeaturesColumns = testFeaturesColumns.reindex(columns=featuresColumns.columns, fill_value=0)

In [None]:
# Predict with the random-forest grid search (`grid2` was built with refit=True,
# so predict() uses the refitted best estimator) and write one row per test
# customer, pairing each `customer_id` with its predicted label.
# NOTE(review): to_csv is called without index=False, so the row index is written
# as an extra unnamed first column; confirm the expected submission format.
prediction = grid2.predict(testFeaturesColumns)
pd.DataFrame({
    'customer_id': testDF['customer_id'],
    'prediction': prediction
}).to_csv('/content/drive/MyDrive/TSW-hackathon-1/OutputTest.csv')

In [None]:
# Mean cross-validated score of the best parameter combination found by `grid2`.
grid2.best_score_

0.7211969302260085