In [1]:
import pandas as pd
df = pd.read_csv('train.csv')

---
---

In [2]:
!pip install category_encoders # installing a new library (my first time)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting category_encoders
  Downloading category_encoders-2.5.1.post0-py2.py3-none-any.whl (72 kB)
[K     |████████████████████████████████| 72 kB 819 kB/s 
Installing collected packages: category-encoders
Successfully installed category-encoders-2.5.1.post0


# Preprocessing the data

---
---

In [3]:
# Checking if one hot encoding of duty regime is good?
# duty regime yeqinki az columu olar so, yeqin cox rotation etmez unit sphereda
# hemcinin, duty regimin ne qeder onemli oldugunu da olce bilerik

# Duty regime'i silmek ve onehot etmek cox xirda 0.89.. larda bir sey verdi. Yeni, nisbeten have etmek de olar amma almost 
# doest matter belkede buna gore (bele assume etmek olar belke de cunki random forestde oxsar tapdi)

# ********************

# TaxRate silinse ne olar? (sirada, issuedatetime?) TaxRate knn'i bir qeder 0.1 lerde azaltdi amma foresti 1 lerde
# yeni olsa sanki daha yaxsidi demek olar

# *********************
# Issue date time geri gelse ne olar? Sirada, agirligi ancaq issuedate time a vermek ucun, countryni silmek - Issue date time nope
# hemcinin, classification id'ni target encode etmek yeni ideyadir

# *********************
# check how important is classification id. You can say i think that export is important.

---
---

In [4]:
df['IssueDateTime'] = (df['IssueDateTime'].str.split('-').str.get(1)).astype(int)

---

---

In [5]:
df['ClassificationID'] = df['ClassificationID'] // 100000000

---

---

In [6]:
deleted_columns = ['IssueDateTime', 'ID', 'ProcessType', 'TransactionNature', 'Type', 'PaymentType', 'DeclarerID', 'ImporterID',
                   'SellerID', 'ExpressID', 'DisplayIndicator', 'OriginCountry', 'BorderTransportMeans', 'DeclarationOfficeID', 'DutyRegime']

---

---

In [7]:
df = df.drop(df[deleted_columns], axis = 1)

---

---

In [8]:
label_encoded_column = ['ClassificationID']

---

---

In [9]:
for label in label_encoded_column :
  df[label] = pd.factorize(df[label], sort = True)[0]

---
---

In [10]:
one_hot_columns_knn = ['ExportationCountry']

---
---

In [11]:
 df = pd.get_dummies(df, columns = one_hot_columns_knn, drop_first=True)

---
---

In [12]:
features = [col for col in df.columns if col != 'Fake']

---
---

In [13]:
from category_encoders import TargetEncoder

---
---

In [14]:
# encoder = TargetEncoder()
# export_column = df['ExportationCountry']
# df['ExportationCountry'] = encoder.fit_transform(df['ExportationCountry'], df['Fake'])

---
---

---
---
---
---

# Models 

---
---

In [15]:
models = {}

# Logistic Regression
from sklearn.linear_model import LogisticRegression
models['Logistic Regression'] = LogisticRegression()

# Support Vector Machines
from sklearn.svm import LinearSVC
models['Support Vector Machines'] = LinearSVC()

# Decision Trees
from sklearn.tree import DecisionTreeClassifier
models['Decision Trees'] = DecisionTreeClassifier()

# Random Forest
from sklearn.ensemble import RandomForestClassifier
models['Random Forest'] = RandomForestClassifier()

# Naive Bayes
from sklearn.naive_bayes import GaussianNB
models['Naive Bayes'] = GaussianNB()

# K-Nearest Neighbors
from sklearn.neighbors import KNeighborsClassifier
models['K-Nearest Neighbor'] = KNeighborsClassifier(n_neighbors = 12, weights = 'distance')

---
---

In [16]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df[features], df['Fake'], test_size = 0.25 )

---
---

In [17]:
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
accuracy, precision, recall = {}, {}, {}

for key in models.keys():
    
    # Fit the classifier model
    models[key].fit(X_train, y_train)
    
    # Prediction 
    predictions = models[key].predict(X_test)
    
    # Calculate Accuracy, Precision and Recall Metrics
    accuracy[key] = accuracy_score(predictions, y_test)
    precision[key] = precision_score(predictions, y_test)
    recall[key] = recall_score(predictions, y_test)



---
---

In [18]:
df_model = pd.DataFrame(index=models.keys(), columns=['Accuracy', 'Precision', 'Recall'])
df_model['Accuracy'] = accuracy.values()
df_model['Precision'] = precision.values()
df_model['Recall'] = recall.values()

df_model

Unnamed: 0,Accuracy,Precision,Recall
Logistic Regression,0.778976,0.00383,0.470588
Support Vector Machines,0.777284,0.011489,0.369231
Decision Trees,0.869395,0.692676,0.709314
Random Forest,0.89319,0.6472,0.832
Naive Bayes,0.778976,0.000479,0.333333
K-Nearest Neighbor,0.891497,0.63954,0.830329


---
---

In [19]:
# k_values = {}
# other_accuracy = {}
# for i in range(10,40) :
#   k_values[i] = KNeighborsClassifier(n_neighbors = i+1, weights = 'distance')
#   k_values[i].fit(X_train, y_train)
#   predictions = k_values[i].predict(X_test)
#   other_accuracy[i] = accuracy_score(predictions, y_test)

# accuracy_df = pd.DataFrame(index = ['accuracy'], data = other_accuracy)
# accuracy_df

---
---

# GridSearch KNN
---

---
---

In [20]:
from sklearn.model_selection import GridSearchCV 
knn = KNeighborsClassifier()
gsearch = GridSearchCV( estimator = knn, 
                        param_grid = {'n_neighbors' : list(range(150, 151)),'weights' : ['distance'], 'p' : [1]}, 
                        cv = 5, 
                        scoring = 'accuracy')

gsearch.fit(X_train, y_train)

# gsearch.cv_results_

GridSearchCV(cv=5, estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors': [150], 'p': [1],
                         'weights': ['distance']},
             scoring='accuracy')

In [21]:
# print(gsearch.cv_results_['mean_test_score'])
# print('\n')
# print(gsearch.cv_results_['params'])
for param, result in zip(gsearch.cv_results_['params'], gsearch.cv_results_['mean_test_score']):
  print(param, ' - result: ', result)

{'n_neighbors': 150, 'p': 1, 'weights': 'distance'}  - result:  0.8996336777844531


In [22]:
gsearch.best_estimator_

KNeighborsClassifier(n_neighbors=150, p=1, weights='distance')

In [23]:
knn_best = gsearch.best_estimator_

In [None]:
# 59 0.8991637723981538 (current knn_best I hope)
# 28 0.8955282554772646 (duty regime one hot encoded)
# 136 0.8997587590739784 (no duty regime, exportcountry onehotted)
# 384 0.8976764968575655 (current target encoded export's best)

# Random Forest Grid Search
---

Forest Data preprocessing
---

---
---

In [None]:
# knn deki target encoding belke forest ucun daha yaxsidi.En azindan, basqa modellede eyni neticenin ne derece alindigini gormek target encoding
# in yaxsiligi haqda info verir

---
---

In [None]:
df_forest = pd.read_csv('train.csv')

---
---

In [None]:
df_forest['IssueDateTime'] = (df_forest['IssueDateTime'].str.split('-').str.get(1)).astype(int)

---
---

In [None]:
df_forest['ClassificationID'] = df_forest['ClassificationID'] // 100000000

---
---

In [None]:
deleted_columns_forest = ['IssueDateTime', 'ID', 'ProcessType', 'TransactionNature', 'Type', 'PaymentType', 'DeclarerID', 'ImporterID',
                   'SellerID', 'ExpressID', 'DisplayIndicator', 'OriginCountry', 'DeclarationOfficeID', 'BorderTransportMeans', 'DutyRegime']

---
---

In [None]:
df_forest = df_forest.drop(df_forest[deleted_columns_forest], axis = 1)

---
---

In [None]:
label_encoded_column_forest = ['ExportationCountry']

---
---

In [None]:
for label in label_encoded_column_forest :
  df_forest[label] = pd.factorize(df_forest[label], sort = True)[0]

---
---

In [None]:
one_hot_columns_forest = ['ClassificationID']

---
---

In [None]:
df_forest = pd.get_dummies(df_forest, columns= one_hot_columns_forest, drop_first=True)

---
---

In [None]:
features_forest = [col for col in df_forest if col != 'Fake']

---
---

In [None]:
X_train_forest, X_test_forest, y_train_forest, y_test_forest = train_test_split(df_forest[features_forest], df_forest['Fake'], test_size = 0.2 )

---
---

Forest Grid
---

In [None]:
randomforest = RandomForestClassifier()
randomgrid = GridSearchCV( estimator = randomforest, 
                        param_grid = {'n_estimators' : [300, 350, 425],
                                      }, 
                        cv = 8, 
                        scoring = 'accuracy')

randomgrid.fit(X_train_forest, y_train_forest)

# gsearch.cv_results_

GridSearchCV(cv=8, estimator=RandomForestClassifier(),
             param_grid={'n_estimators': [300, 350, 425]}, scoring='accuracy')

---
---

In [None]:
for param, result in zip(randomgrid.cv_results_['params'], randomgrid.cv_results_['mean_test_score']):
  print(param, ' - result: ', result)

{'n_estimators': 300}  - result:  0.8929834344774621
{'n_estimators': 350}  - result:  0.8924546399580746
{'n_estimators': 425}  - result:  0.8929173580966256


---
---

In [None]:
randomgrid.best_estimator_

RandomForestClassifier(n_estimators=300)

---
---

In [None]:
randomforest_best = randomgrid.best_estimator_

In [None]:
# 0.8911656394292539
# 0.8953959344267886 (with no duty regime and export one hot encoded)

In [None]:
# wwhich columns to include and drop
# if you include should you do label encoding, target encoding, onehotencoding

# Submission to Kaggle

randomforest
---

In [None]:
#

df_kgl_forest = pd.read_csv('test.csv')
df_kgl_forest['IssueDateTime'] = (df_kgl_forest['IssueDateTime'].str.split('-').str.get(1)).astype(int)
df_kgl_forest = df_kgl_forest.drop(df_kgl_forest[deleted_columns_forest], axis = 1)
df_kgl_forest['ClassificationID'] = df_kgl_forest['ClassificationID'] // 100000000
#
for label in label_encoded_column_forest :
  df_kgl_forest[label] = pd.factorize(df_kgl_forest[label], sort = True)[0]
#
df_kgl_forest = pd.get_dummies(df_kgl_forest, columns=one_hot_columns_knn, drop_first=True)


---
---

In [None]:
missing_columns = [col for col in X_train_forest if col not in df_kgl_forest]

---
---

In [None]:
for col in missing_columns:
  df_kgl_forest[col] = 0

---
---

In [None]:
additional_columns_forest = []
for col in df_kgl_forest:
  if col not in X_train_forest:
    additional_columns_forest.append(col)

---
---

In [None]:
df_kgl_forest = df_kgl_forest.drop(df_kgl_forest[additional_columns_forest], axis = 1)

---
---

In [None]:
col_forest = randomforest_best.predict(df_kgl_forest)
submit_forest = df_kgl_forest.copy()
submit_forest = submit_forest.drop(columns = df_kgl_forest.columns, axis = 1)
submit_forest['ID'] = pd.read_csv('test.csv')['ID']
submit_forest['Fake'] = col_forest

Feature names must be in the same order as they were in fit.



---
---

In [None]:
submit_forest.to_csv('randomforest.csv',index = False)

knn
---

In [25]:
#

df_kgl_knn = pd.read_csv('test.csv')
df_kgl_knn['IssueDateTime'] = (df_kgl_knn['IssueDateTime'].str.split('-').str.get(1)).astype(int)
df_kgl_knn = df_kgl_knn.drop(df_kgl_knn[deleted_columns], axis = 1)
df_kgl_knn['ClassificationID'] = df_kgl_knn['ClassificationID'] // 100000000

for label in label_encoded_column :
  df_kgl_knn[label] = pd.factorize(df_kgl_knn[label], sort = True)[0]

df_kgl_knn = pd.get_dummies(df_kgl_knn, columns = one_hot_columns_knn, drop_first=True)


---
---

In [26]:
missing_columns_knn = [col for col in X_train if col not in df_kgl_knn]

---
---

In [27]:
for col in missing_columns_knn:
  df_kgl_knn[col] = 0

---
---

In [28]:
additional_columns = []
for col in df_kgl_knn:
  if col not in X_train:
    additional_columns.append(col)

---
---

In [29]:
df_kgl_knn = df_kgl_knn.drop(df_kgl_knn[additional_columns], axis = 1)

---
---

In [None]:
# data = {'column 1' : [1, 2, 5], 'column 2' : [10, 76, 89]}
# test_df = pd.DataFrame(data)

In [None]:
# for index, row in test_df.iterrows():
#   if index %2 == 0:
#     row['column 1'] = 100

In [None]:
#  test_df

---
---

In [30]:
# for index_kgl, row_kgl_knn in df_kgl_knn.iterrows() :
#   index = -1
#   for country in export_column :
#     index = index + 1
#     if country == row_kgl_knn['ExportationCountry']:
#       row_kgl_knn['ExportationCountry'] = df['ExportationCountry'][index]
#       df_kgl_knn.iloc[index_kgl] = row_kgl_knn
#       break

---
---

In [32]:
# additional_countries_knn = [row['ExportationCountry'] for index, row in df_kgl_knn.iterrows() 
#                               if row['ExportationCountry'] not in export_column]

---
---

In [None]:
col_knn = knn_best.predict(df_kgl_knn)
submit_knn = df_kgl_knn.copy()
submit_knn = submit_knn.drop(columns = df_kgl_knn.columns, axis = 1)
submit_knn['ID'] = pd.read_csv('test.csv')['ID']
submit_knn['Fake'] = col_knn

---
---

In [None]:
submit_knn.to_csv('knn.csv',index = False)