Note that all `.to_csv` calls have been commented out.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
# Load the training and test sets from CSV files in the working directory.
train_DF, test_DF = map(pd.read_csv, ("train.csv", "test.csv"))

In [None]:
# Overview
# Feature-only views: remove the row identifier, and for the training set
# also the label column, so the shapes below count features only.
train_Overview = train_DF.drop(columns=["id", "target"])
test_Overview = test_DF.drop(columns=["id"])

# DataFrame.shape is (rows, columns), i.e. (instances, features).
instances_train, features_train = train_Overview.shape
instances_test, features_test = test_Overview.shape

print("Data Overview of the train data:")
print("Number of instances:", instances_train)
print("Number of features:", features_train)

print("Data Overview of the test data:")
print("Number of instances:", instances_test)
print("Number of features:", features_test)

Data Overview of the train data:
Number of instances: 250
Number of features: 300
Data Overview of the test data:
Number of instances: 19750
Number of features: 300


In [None]:
# Correlation & Feature Reduction
# Absolute pairwise correlations between all columns except "id"; the matrix
# therefore still contains the "target" column.
train_Corr = train_DF.drop("id" ,axis= 1).corr().abs()
# Keep only the strict upper triangle (k=1 excludes the diagonal); everything
# else becomes NaN, so each column is compared only with the columns before it.
upper = train_Corr.where(np.triu(np.ones(train_Corr.shape), k=1).astype(bool))
# A column is discarded when its absolute correlation with any earlier column
# is below 0.0002 (NaN comparisons evaluate to False and never discard).
# NOTE(review): correlation filters usually drop *highly* correlated columns;
# confirm that discarding near-zero correlations is the intended direction.
to_keep = [column for column in upper.columns if not (upper[column] < 0.0002).any()]
print(len(to_keep))
# Exclude the label by name. The previous positional slice `to_keep[1:]`
# silently assumed "target" was the first retained column; with a different
# column order it would drop a real feature and could leak the label into
# the feature matrix.
feature_columns = [column for column in to_keep if column != "target"]
reduced_train_DF = train_DF[feature_columns]
reduced_test_DF = test_DF[feature_columns]

# Train Model
# Logistic regression with default settings on the reduced feature set.
logRegModel = LogisticRegression()
train_Y = train_DF["target"]
train_X = reduced_train_DF
test_X = reduced_test_DF

logRegModel.fit(train_X, train_Y)
predicted_target = logRegModel.predict(test_X)
# Submission frame: one predicted label per test id.
combined_DF = pd.DataFrame({
    'id': test_DF["id"],
    'target': predicted_target
})
#combined_DF.to_csv('submission_2.csv',index=False)

216


In [None]:
# Regularization
# Labels and full feature matrices (identifier removed; label removed from train).
train_Y = train_DF["target"]
train_X = train_DF.drop(columns=["id", "target"])
test_X = test_DF.drop(columns=["id"])

# Two logistic regressions with the same strength (C=1) that differ only in
# the penalty norm; the L1 variant is given the liblinear solver.
logRegModel_L1 = LogisticRegression(penalty="l1", C=1, solver='liblinear')
logRegModel_L2 = LogisticRegression(penalty="l2", C=1)

# Fit each model on the training data, L1 first.
for fitted_model in (logRegModel_L1, logRegModel_L2):
    fitted_model.fit(train_X, train_Y)

predicted_target_L1 = logRegModel_L1.predict(test_X)
predicted_target_L2 = logRegModel_L2.predict(test_X)

# Submission frames: the test ids paired with each model's predictions.
L1_DF = test_DF[["id"]].assign(target=predicted_target_L1)
L2_DF = test_DF[["id"]].assign(target=predicted_target_L2)

#L1_DF.to_csv('submission_L1.csv',index=False)
#L2_DF.to_csv('submission_L2.csv',index=False)

In [None]:
# Cross Validation
# Labels and full feature matrices (identifier removed; label removed from train).
train_Y = train_DF["target"]
train_X = train_DF.drop(columns=["id", "target"])
test_X = test_DF.drop(columns=["id"])

# L1-penalised logistic regression with 5-fold cross-validation over the
# estimator's default grid of C values.
logRegCVModel = LogisticRegressionCV(cv=5, penalty="l1", solver='liblinear')
logRegCVModel.fit(train_X, train_Y)

predicted_target_CV = logRegCVModel.predict(test_X)

# Submission frame: the test ids paired with the predicted labels.
CV_DF = test_DF[["id"]].assign(target=predicted_target_CV)

#CV_DF.to_csv('submission_LR4CV.csv',index=False)

In [None]:
# Ensemble Methods
# Labels and full feature matrices (identifier removed; label removed from train).
train_X = train_DF.drop("id" ,axis= 1).drop("target" ,axis= 1)
train_Y = train_DF["target"]
test_X = test_DF.drop("id" ,axis= 1)

# Fixed seed handed to every randomized scikit-learn estimator in this cell.
# Without it the forest, the decision trees and AdaBoost draw fresh random
# numbers on each run, so the predictions differ after every kernel restart.
ENSEMBLE_SEED = 42

# Random Forest Classifier
RanForModel = RandomForestClassifier(n_estimators=20, random_state=ENSEMBLE_SEED)
RanForModel.fit(train_X,train_Y)
predicted_target_RanFor = RanForModel.predict(test_X)

RanFor_DF = pd.DataFrame({
    'id': test_DF["id"],
    'target': predicted_target_RanFor
})
#RanFor_DF.to_csv('submission_RanFor.csv',index=False)

# XGBoost (left at its default settings; no seed is passed here)
XGBoostModel = XGBClassifier()
XGBoostModel.fit(train_X,train_Y)
predicted_target_XGBoost = XGBoostModel.predict(test_X)
XGBoost_DF = pd.DataFrame({
    'id': test_DF["id"],
    'target': predicted_target_XGBoost
})
#XGBoost_DF.to_csv('submission_XGB.csv',index=False)

# AdaBoost
# NOTE(review): the base learner is a decision tree with no depth limit. Such
# a tree can presumably fit the training set perfectly, which would leave
# boosting nothing to reweight -- consider a shallow tree; confirm.
AdaBoostModel = AdaBoostClassifier(
    DecisionTreeClassifier(random_state=ENSEMBLE_SEED),
    n_estimators=20,
    random_state=ENSEMBLE_SEED,
)
AdaBoostModel.fit(train_X,train_Y)
predicted_target_AdaBoost = AdaBoostModel.predict(test_X)

AdaBoost_DF = pd.DataFrame({
    'id': test_DF["id"],
    'target': predicted_target_AdaBoost
})
#AdaBoost_DF.to_csv('submission_AdaBoost.csv',index=False)

# Voting
# Reference: https://www.youtube.com/watch?v=X3Wbfb4M33w
# Hard (majority) vote over three different model families.
LogRegModel_Vot = LogisticRegression()
DecTreModel_Vot = DecisionTreeClassifier(random_state=ENSEMBLE_SEED)
SVMModel_Vot = SVC(kernel = 'poly', degree=2)
VotingModel = VotingClassifier(
    voting = 'hard',
    estimators=[
        ('lr',LogRegModel_Vot),
        ('dt',DecTreModel_Vot),
        ('svm',SVMModel_Vot)
    ])
VotingModel.fit(train_X,train_Y)
predicted_target_Voting = VotingModel.predict(test_X)
Voting_DF = pd.DataFrame({
    'id': test_DF["id"],
    'target': predicted_target_Voting
})
#Voting_DF.to_csv('submission_Voting.csv',index=False)

In [None]:
# Improvement and tuning
# Labels and full feature matrices (identifier removed; label removed from train).
train_X = train_DF.drop("id" ,axis= 1).drop("target" ,axis= 1)
train_Y = train_DF["target"]
test_X = test_DF.drop("id" ,axis= 1)

# L1-penalised logistic regression with a hand-picked, stronger penalty
# (C=0.1, compared with C=1 in the regularization cell).
logRegModel_Tun = LogisticRegression(penalty="l1", C=0.1, solver='liblinear')
logRegModel_Tun.fit(train_X, train_Y)
predicted_target_Tun = logRegModel_Tun.predict(test_X)

LR_DF = pd.DataFrame({
    'id': test_DF["id"],
    'target': predicted_target_Tun
})
#LR_DF.to_csv('submission_LRTUN.csv',index=False)


# 5-fold cross-validated search over an evenly spaced grid of C values around
# the hand-picked 0.1. The first entry was previously 0.95, which broke the
# 0.05-step pattern of the rest of the grid and looks like a typo for 0.05.
logRegModelCV_Tun = LogisticRegressionCV(penalty='l1',Cs=[0.05, 0.1, 0.15, 0.2, 0.25, 0.3], cv=5, solver='liblinear')
logRegModelCV_Tun.fit(train_X, train_Y)
predicted_target_CVTun = logRegModelCV_Tun.predict(test_X)

LRCV_DF = pd.DataFrame({
    'id': test_DF["id"],
    'target': predicted_target_CVTun
})
#LRCV_DF.to_csv('submission_LRCVTUN.csv',index=False)