# Проект по курсу Машинное обучение
### 1курс маг. 6 группа. Полтавский Артем, Мезга Александр
### Датасет: https://archive.ics.uci.edu/ml/datasets/Dishonest+Internet+users+Dataset
### Референсная статья: https://www.hindawi.com/journals/complexity/2020/4579495/
### Точность предсказаний которую нужно превзойти: 93.87%

In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import  confusion_matrix,accuracy_score,precision_score,recall_score,f1_score

In [2]:
# Загрузка данных. Первоначальный анализ

In [3]:
filename = "dishonest-internet-users-dataset.csv"
data= pd.read_csv(filename)
df = pd.DataFrame(data)
df

Unnamed: 0,ctrust,cuntrust,last,context,score
0,4,1,4,sport,untrustworthy
1,4,1,4,sport,untrustworthy
2,1,4,4,sport,trustworthy
3,2,1,4,sport,trustworthy
4,3,1,4,sport,trustworthy
5,4,1,4,sport,untrustworthy
6,1,4,4,sport,trustworthy
7,2,1,4,sport,trustworthy
8,3,1,4,sport,trustworthy
9,4,1,4,sport,untrustworthy


In [4]:
df.describe()

Unnamed: 0,ctrust,cuntrust,last
count,322.0,322.0,322.0
mean,2.195652,1.52795,2.36646
std,1.273592,1.105422,1.347384
min,1.0,1.0,1.0
25%,1.0,1.0,1.0
50%,2.0,1.0,2.0
75%,4.0,1.0,4.0
max,4.0,5.0,4.0


In [5]:
print('ctrust',df.ctrust.unique())
print('cuntrust',df.cuntrust.unique())
print('last',df['last'].unique())
print('context',df.context.unique())
print('score',df.score.unique())

ctrust [4 1 2 3]
cuntrust [1 4 3 5]
last [4 1 3 2]
context ['sport' 'game' 'ECommerce' 'holiday']
score ['untrustworthy' 'trustworthy']


# Перевод признаков из строк в категориальные значения

In [6]:
df['score'].replace({'untrustworthy':0,'trustworthy':1},inplace=True)
df

Unnamed: 0,ctrust,cuntrust,last,context,score
0,4,1,4,sport,0
1,4,1,4,sport,0
2,1,4,4,sport,1
3,2,1,4,sport,1
4,3,1,4,sport,1
5,4,1,4,sport,0
6,1,4,4,sport,1
7,2,1,4,sport,1
8,3,1,4,sport,1
9,4,1,4,sport,0


In [7]:
df['context'].replace({'sport':1,'game':2,'ECommerce':3,'holiday':4},inplace=True)
df

Unnamed: 0,ctrust,cuntrust,last,context,score
0,4,1,4,1,0
1,4,1,4,1,0
2,1,4,4,1,1
3,2,1,4,1,1
4,3,1,4,1,1
5,4,1,4,1,0
6,1,4,4,1,1
7,2,1,4,1,1
8,3,1,4,1,1
9,4,1,4,1,0


In [8]:
# Разделить данные на X и y, нормализовать данные

In [9]:
y=data.score.values
x_data=data.drop("score",axis=1)

In [11]:
x=(x_data-np.min(x_data))/(np.max(x_data)-np.min(x_data))
x

Unnamed: 0,ctrust,cuntrust,last,context
0,1.000000,0.00,1.000000,0.0
1,1.000000,0.00,1.000000,0.0
2,0.000000,0.75,1.000000,0.0
3,0.333333,0.00,1.000000,0.0
4,0.666667,0.00,1.000000,0.0
...,...,...,...,...
317,0.333333,0.00,0.333333,1.0
318,0.000000,0.00,0.000000,1.0
319,0.000000,0.00,0.000000,1.0
320,0.000000,0.00,0.000000,1.0


In [75]:
import seaborn as sns
from numpy import mean
from numpy import std
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import StackingClassifier
from sklearn.model_selection import train_test_split

from tabulate import tabulate

In [34]:
# SVM

In [46]:
def calc_svm(x, y, train_size):
    x_train,x_test,y_train,y_test=train_test_split(x,y,train_size=train_size,random_state=42)
    
    svclassifier = SVC(kernel='linear')
    svclassifier.fit(x_train, y_train)
    y_pred = svclassifier.predict(x_test)
    confu_svm=confusion_matrix(y_test,y_pred)
    accuracy_svm=accuracy_score(y_test,y_pred)
    precision_svm=precision_score(y_test,y_pred)
    recall_svm=recall_score(y_test,y_pred)
    f1_svm=f1_score(y_test,y_pred)
    return ([accuracy_svm,precision_svm,recall_svm,f1_svm], confu_svm)

In [47]:
# Naive Bayes

In [48]:
def calc_nb(x, y, train_size):
    x_train,x_test,y_train,y_test=train_test_split(x,y,train_size=train_size,random_state=42)
    
    nb=GaussianNB()
    nb.fit(x_train,y_train)
    y_pred = nb.predict(x_test)
    confu_nb=confusion_matrix(y_test,y_pred)
    accuracy_nb=accuracy_score(y_test,y_pred)
    precision_nb=precision_score(y_test,y_pred)
    recall_nb=recall_score(y_test,y_pred)
    f1_nb=f1_score(y_test,y_pred)
    return ([accuracy_nb,precision_nb,recall_nb,f1_nb], confu_nb)

In [49]:
def compare_results(x, y, train_size=0.8):
    a='SVM'
    svm_results = calc_svm(x, y, train_size=train_size)
    b='Naive Bayes'
    nb_results = calc_nb(x, y, train_size=train_size)
    
    confu_nb = nb_results[1]
    result1=(a,*nb_results[0])
    
    confu_svm = svm_results[1]
    result2=(b,*svm_results[0])
    
    result=(result1,result2)
    print('confusion matrix SVM')
    print(confu_svm)

    print('confusion matrix Naive Bayes')
    print(confu_nb)

    print(tabulate(result, headers=["accuracy", "precision", "recall","f1_score"]))

In [66]:
compare_results(x,y,train_size=0.7)

confusion matrix SVM
[[16 11]
 [ 6 64]]
confusion matrix Naive Bayes
[[ 9 18]
 [ 5 65]]
               accuracy    precision    recall    f1_score
-----------  ----------  -----------  --------  ----------
SVM            0.762887     0.783133  0.928571    0.849673
Naive Bayes    0.824742     0.853333  0.914286    0.882759


In [67]:
def get_stacking():
    # define the base models
    level0 = list()
    level0.append(('svm', SVC()))
    level0.append(('bayes', GaussianNB()))
    # define meta learner model
    level1 = LogisticRegression()
    # define the stacking ensemble
    model = StackingClassifier(estimators=level0, final_estimator=level1, cv=5)
    print(model)
    return model

In [68]:
clf = get_stacking()

StackingClassifier(cv=5, estimators=[('svm', SVC()), ('bayes', GaussianNB())],
                   final_estimator=LogisticRegression())


In [78]:
stack_results = []
for train_size in [i*0.1 for i in range(1,10)]:
    X_train,X_test,y_train,y_test=train_test_split(x,y,train_size=train_size,random_state=37)
    acc = clf.fit(X_train, y_train).score(X_test, y_test)
    stack_results.append((acc, train_size))

In [79]:
stack_results

[(0.8068965517241379, 0.1),
 (0.9147286821705426, 0.2),
 (0.9823008849557522, 0.30000000000000004),
 (0.9845360824742269, 0.4),
 (0.9875776397515528, 0.5),
 (0.9844961240310077, 0.6000000000000001),
 (1.0, 0.7000000000000001),
 (1.0, 0.8),
 (1.0, 0.9)]

In [80]:
ax = sns.pointplot(x="acc", y="train_size",
                   data=stack_results, dodge=True)

AttributeError: 'list' object has no attribute 'get'