We will be looking at a Kaggle competition: Santander Customer Transaction Prediction.

In this challenge, we will identify which customers will make a specific transaction in the future, irrespective of the amount of money transacted.

# Importing libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC 
from sklearn.naive_bayes import GaussianNB          
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score

# Reading and Previewing the data

In [None]:
# Training data: read train.csv (expected in the working directory) into a DataFrame
# and preview its first rows. Later cells take the label from column position 1
# and the features from column position 2 onward.
train=pd.read_csv('train.csv')
train.head()

In [None]:
# Test data: read test.csv (expected in the working directory) into a DataFrame
# and preview its first rows. Later cells rely on it having an "ID_code" column.
test=pd.read_csv('test.csv')
test.head()

In [None]:
# Sample submission: read sample_submission.csv and preview it.
# NOTE(review): `submission` is not referenced by any later cell shown here;
# presumably it is loaded only to inspect the expected output format.
submission=pd.read_csv('sample_submission.csv')
submission.head()

In [None]:
# Checking the shape of the training data: (number of rows, number of columns)
train.shape

In [None]:
# Checking the structure of the columns: prints a summary of the training frame
# (index range, column dtypes and memory usage)
train.info()

In [None]:
# Shape of the test data: (number of rows, number of columns)
test.shape

# Exploratory Data Analysis

In [None]:
# Checking for missing values: number of null entries in each column of the training data
train.isnull().sum()

In [None]:
# Number of null entries in each column of the test data
test.isnull().sum()

In [None]:
# Checking for duplicates: number of training rows that exactly repeat an earlier row
train.duplicated().sum()

In [None]:
# Number of test rows that exactly repeat an earlier row
test.duplicated().sum()

# Visualizations

In [None]:
#Correlation matrix
# Restrict to numeric columns before correlating: DataFrame.corr() raises on
# string columns in pandas >= 2.0 (older versions silently dropped them).
# NOTE(review): test.csv has an "ID_code" identifier column; train.csv presumably
# has one too (its column 0 is skipped when the features are selected) - confirm.
corr_matrix = train.select_dtypes(include="number").corr()
corr_matrix

# Histograms

In [None]:
# Bar chart of how many training rows fall into each target class.
# Drawn on an explicit Axes so the figure can carry a title and axis labels
# and still reads correctly when the notebook is only skimmed.
fig, ax = plt.subplots()
train['target'].value_counts().plot.bar(ax=ax)
ax.set(title="Target class distribution", xlabel="target", ylabel="Number of rows");

# Train-Test-Split

In [None]:
# Separate the training frame into model inputs and labels, both as NumPy arrays
X = train.iloc[:, 2:].to_numpy()  # every column from position 2 onward -> attributes
y = train.iloc[:, 1].to_numpy()   # the column at position 1 -> labels

In [None]:
# Features of the competition test set: test.csv without its identifier column.
# Stored under its own name because the `X_test` originally assigned here was
# overwritten by the train_test_split call in the next cell and never reached a model.
# Converted to a NumPy array to match how the training features `X` are built.
X_submission = test.drop(columns="ID_code").to_numpy()

In [None]:
# Hold out 33% of the labelled rows for evaluation; the fixed random_state makes
# the split repeatable. X_test / y_test below are this hold-out part of train.csv.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=101,
)

# Training and Predicting the data

In [None]:
#Creating instances
# The randomized estimators get a fixed seed so a fresh run reproduces the same
# scores, matching the seed already set on the logistic regression.
logistic_classifier = LogisticRegression(random_state=0)
decision_classifier = DecisionTreeClassifier(random_state=0)
random_forest_classifier = RandomForestClassifier(random_state=0)
# NOTE(review): per the scikit-learn docs, SVC fit time grows at least quadratically
# with the number of rows - confirm this is acceptable for the size of train.csv.
svm_classifier = SVC()
knn_classifier = KNeighborsClassifier(n_neighbors=5)
# Created unfitted: it used to be fitted here and then fitted a second time below.
naive_classifier = GaussianNB()

#Training the data
# Every model is fitted exactly once, on the same training split.
for classifier in (
    logistic_classifier,
    decision_classifier,
    random_forest_classifier,
    svm_classifier,
    knn_classifier,
    naive_classifier,
):
    classifier.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out rows (X_test) with each fitted model, in the
# same order the models were trained.
(
    logistic_prediction,
    decision_prediction,
    random_forest_prediction,
    svm_prediction,
    knn_prediction,
    naive_prediction,
) = [
    fitted_model.predict(X_test)
    for fitted_model in (
        logistic_classifier,
        decision_classifier,
        random_forest_classifier,
        svm_classifier,
        knn_classifier,
        naive_classifier,
    )
]

# Evaluating the model

## Accuracy Score

In [None]:
# Accuracy of each model on the held-out split.
# accuracy_score is documented as accuracy_score(y_true, y_pred), so the true
# labels go first; each figure is printed with its model name so the output
# can be read without counting lines.
predictions_by_model = {
    "Logistic Regression": logistic_prediction,
    "Decision Tree": decision_prediction,
    "Random Forest": random_forest_prediction,
    "SVM": svm_prediction,
    "KNN": knn_prediction,
    "Naive Bayes": naive_prediction,
}
for model_name, prediction in predictions_by_model.items():
    print(f"{model_name}: {accuracy_score(y_test, prediction)}")

## Classification Report

In [None]:
# Per-class precision / recall / F1 for each model on the held-out split.
# classification_report is documented as classification_report(y_true, y_pred):
# the true labels must come first, otherwise precision and recall are reported
# swapped and the "support" column counts predictions instead of true labels.
print('Logistic Classifier:')
print(classification_report(y_test, logistic_prediction))

print('Decision Tree Classifier:')
print(classification_report(y_test, decision_prediction))

print('Random Forest Classifier:')
print(classification_report(y_test, random_forest_prediction))

print('SVM Classifier:')
print(classification_report(y_test, svm_prediction))

print('KNN Classifier:')
print(classification_report(y_test, knn_prediction))

print('Naive Bayes Classifier')
print(classification_report(y_test, naive_prediction))

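Based on the accuracy scores and classification reports above, the Random Forest classifier performed the best, so it is the model used to create the submission file.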

# Creating a submission file

In [None]:
# Build the submission from predictions on test.csv itself.
# `random_forest_prediction` holds predictions for the hold-out split of train.csv,
# so its rows do not correspond to the ID_codes in test.csv; the random forest is
# therefore applied to the test.csv features here.
# Assumes the remaining test.csv columns are the same features, in the same order,
# as the training features (train.iloc[:, 2:]) - TODO confirm.
submission_features = test.drop(columns="ID_code").to_numpy()
sub_df = pd.DataFrame({"ID_code": test["ID_code"].values})
sub_df["target"] = random_forest_classifier.predict(submission_features)
sub_df.to_csv("submission.csv", index=False)

In [None]:
# Second copy of the submission, written to submission_df.csv.
# Predictions are computed on the test.csv features so that every ID_code is paired
# with a prediction for its own row; `random_forest_prediction` cannot be used
# because it was produced for the hold-out split of train.csv.
# Assumes the remaining test.csv columns are the same features, in the same order,
# as the training features (train.iloc[:, 2:]) - TODO confirm.
submission_df = pd.DataFrame({
        "ID_code": test["ID_code"],
        "target": random_forest_classifier.predict(
            test.drop(columns="ID_code").to_numpy()
        ),
    })
submission_df.to_csv('submission_df.csv', index=False)

In [None]:
# Preview the first rows of the submission frame built above
sub_df.head()