# Data Import

In [9]:
import xgboost as xgb
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

Classification Models: LDA, QDA, Logistic regression, SVM, Decision Trees, Random Forests, various forms of Boosting

# 1 Tree based methods

## 1.1 Decision Tree

In [10]:
train = pd.read_csv('/Users/jerry/Desktop/udemy courses/Kaggle数据分析/Avito Duplicate Ads Detection/train.csv')

In [11]:
train.head()

Unnamed: 0.1,Unnamed: 0,isDuplicate,price,location,distance,description,title,category,region,parentCategory
0,0,1,0.0,1,0.0,1.0,1.0,1,1,1
1,1,1,0.1,1,0.0,0.922315,0.977778,1,1,1
2,2,0,0.0,1,0.0,0.561137,0.981799,1,1,1
3,3,1,0.0,1,9.65594,0.999173,0.788961,1,1,1
4,4,1,0.2,1,0.0,0.694822,0.360317,1,1,1


In [12]:
X = train.drop(['Unnamed: 0', 'isDuplicate'],axis=1)

In [13]:
y = train[['isDuplicate']]

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15)

changing pandas dataframe to numpy array
X_train = X_train.to_numpy()
y_train = y_train.to_numpy()

X_test = X_test.to_numpy()
y_test = y_test.to_numpy()

normalizing the data
sc = StandardScaler()
X_train = sc.fit_transform(X_train)

X_test = sc.fit_transform(X_test)



In [21]:
from sklearn.tree import DecisionTreeClassifier
dtree = DecisionTreeClassifier()
model = dtree.fit(X_train,y_train)

In [22]:
prediction = model.predict(X_test)

In [23]:
prediction

array([0, 1, 0, ..., 0, 0, 1])

In [15]:
from sklearn.metrics import classification_report, confusion_matrix

In [25]:
print(confusion_matrix(y_test, prediction))

[[167350  66675]
 [ 65366  98397]]


In [26]:
print(classification_report(y_test, prediction))

              precision    recall  f1-score   support

           0       0.72      0.72      0.72    234025
           1       0.60      0.60      0.60    163763

    accuracy                           0.67    397788
   macro avg       0.66      0.66      0.66    397788
weighted avg       0.67      0.67      0.67    397788



In [27]:
probability_prediction0 = model.predict_proba(X_test)[:,1]
probability_prediction = np.array(probability_prediction0)

In [28]:
from sklearn import metrics

fpr0, tpr0, thresholds0 = metrics.roc_curve(y_test, probability_prediction0)
metrics.auc(fpr0, tpr0)

0.6569333231300618

0.6539178180885833

We have a pretty good prediction already for the decision tree with high precision.


## 1.2 Random Forest

In [16]:
from sklearn.ensemble import RandomForestClassifier

In [17]:
rfc = RandomForestClassifier(n_estimators = 50)

In [18]:
rfc.fit(X_train, y_train)

  """Entry point for launching an IPython kernel.


RandomForestClassifier(n_estimators=50)

In [19]:
rfc_pred = rfc.predict(X_test)

In [20]:
print(confusion_matrix(y_test, rfc_pred))

[[184510  49453]
 [ 63846  99979]]


In [21]:
print(classification_report(y_test, rfc_pred))

              precision    recall  f1-score   support

           0       0.74      0.79      0.77    233963
           1       0.67      0.61      0.64    163825

    accuracy                           0.72    397788
   macro avg       0.71      0.70      0.70    397788
weighted avg       0.71      0.72      0.71    397788



There is no huge improvement over the simple decision tree but random forest is still a bit better. 

In [22]:
rfc_pred_prob = rfc.predict_proba(X_test)[:,1]

In [23]:
import numpy as np
from sklearn import metrics

In [24]:
y_test1 = np.array(y_test['isDuplicate'])
y_test1

array([1, 1, 0, ..., 0, 1, 0])

In [25]:
rfc_pred_prob1 = np.array(rfc_pred_prob)
rfc_pred_prob1

array([0.04      , 0.485     , 0.06      , ..., 0.82294034, 0.68294034,
       0.54      ])

In [26]:
fpr, tpr, thresholds = metrics.roc_curve(y_test1, rfc_pred_prob1)
metrics.auc(fpr, tpr)

0.7777082822559407

0.7767214458858888

The random forest has a slightly better outcome for AUC score. 

# Test Data

In [27]:
test = pd.read_csv('/Users/jerry/Desktop/udemy courses/Kaggle数据分析/Avito Duplicate Ads Detection/test.csv')

In [28]:
test.head(5)

Unnamed: 0.1,Unnamed: 0,index,price,location,distance,description,title,category,region,parentCategory
0,0,0,61703.659451,1,0.0,0.603247,0.829645,1,1,1
1,1,1,61703.659451,1,0.0,0.603247,0.829645,1,1,1
2,2,2,61703.659451,1,0.0,0.98776,1.0,1,1,1
3,3,3,1.653061,1,0.0,0.391037,0.638177,1,1,1
4,4,4,0.0,1,0.0,1.0,0.678009,1,1,1


In [29]:
X1 = test.drop(['Unnamed: 0','index'],axis=1)

In [38]:
X1.isna().sum() #price有很多nan，需要

price             0
location          0
distance          0
description       0
title             0
category          0
region            0
parentCategory    0
dtype: int64

In [39]:
list(X1.columns)

['price',
 'location',
 'distance',
 'description',
 'title',
 'category',
 'region',
 'parentCategory']

In [40]:
X1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1044196 entries, 0 to 1044195
Data columns (total 8 columns):
price             1044196 non-null float64
location          1044196 non-null int64
distance          1044196 non-null float64
description       1044196 non-null float64
title             1044196 non-null float64
category          1044196 non-null int64
region            1044196 non-null int64
parentCategory    1044196 non-null int64
dtypes: float64(4), int64(4)
memory usage: 63.7 MB


We do not need imputation in the case.

In [41]:
## Imputation for whole dataframe

In [42]:
#from sklearn.impute import SimpleImputer
#import numpy as np
#imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
#imp_mean.fit(X1)
#X2 = imp_mean.transform(X1)

In [43]:
#X2.shape

In [44]:
#X3 = pd.DataFrame(data = X2, columns=list(X1.columns))

In [45]:
#X3.head()

## Prediction

Use Random Forest to try out for prediction and submit to check the score. 

In [30]:
pred_test = rfc.predict_proba(X1)[:,1]
pred_test = np.array(pred_test)

In [31]:
pred_test

array([0.7 , 0.7 , 0.82, ..., 0.28, 0.14, 0.74])

In [32]:
len(pred_test)

1044196

In [33]:
index = np.array(test['index'].astype(np.int))

In [34]:
pred_test1 = {'id':index,'probability':pred_test}

In [35]:
pred_test2 = pd.DataFrame(pred_test1, columns =['id','probability'])

In [36]:
pred_test2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1044196 entries, 0 to 1044195
Data columns (total 2 columns):
id             1044196 non-null int64
probability    1044196 non-null float64
dtypes: float64(1), int64(1)
memory usage: 15.9 MB


In [37]:
pred_test2.to_csv('/Users/jerry/Desktop/udemy courses/Kaggle数据分析/Avito Duplicate Ads Detection/prediction.csv',index=False)

kaggle score: 0.72370