# K nearest neighbors

In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn import neighbors, datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# load csv file
df = pd.read_csv('./dataset/seeds_dataset.csv', header=None)

X = df[[0, 1, 2, 3, 4, 5, 6]]
y = df[7]-1  #因為他的種類索引是從1開始，我們習慣從0，所以調整一下

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)

model = neighbors.KNeighborsClassifier()
model.fit(X_train, y_train)


X_test = scaler.transform(X_test)
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
num_correct_samples = accuracy_score(y_test, y_pred, normalize=False)

print('number of correct sample: {}'.format(num_correct_samples))
print('accuracy: {}'.format(accuracy))


number of correct sample: 38
accuracy: 0.9047619047619048


# Decision Tree (CART)

In [42]:
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn import neighbors
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix

# load csv file
df = pd.read_csv('./dataset/abalone.csv', header=None)
df[0] = pd.Categorical(df[0]).codes
df[8] = df[8].apply(lambda x: 0 if x> 8 else 1)


X = df[[0, 1, 2, 3, 4, 5, 6,7]]
y = df[8]


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)

model = DecisionTreeClassifier()
model.fit(X_train, y_train)

X_test = scaler.transform(X_test)
y_pred = model.predict(X_test)


accuracy = accuracy_score(y_test, y_pred)
num_correct_samples = accuracy_score(y_test, y_pred, normalize=False)
con_matrix = confusion_matrix(y_test, y_pred)

print('number of correct sample: {}'.format(num_correct_samples))
print('accuracy: {}'.format(accuracy))
print('con_matrix: {}'.format(con_matrix))



number of correct sample: 645
accuracy: 0.7715311004784688
con_matrix: [[443  91]
 [100 202]]


# Naive Bayes
ref: https://blog.sicara.com/naive-bayes-classifier-sklearn-python-example-tips-42d100429e44  
dataset: https://www.kaggle.com/c/titanic/data


In [36]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB

# Importing dataset
df = pd.read_csv("./dataset/titanic/train.csv")

df['Sex'] = pd.Categorical(df['Sex']).codes
df['Embarked'] = pd.Categorical(df['Embarked']).codes


df=df[["Survived", "Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]].dropna(axis=0, how='any')

X_train, X_test = train_test_split(df, test_size=0.5)

gnb = GaussianNB()
used_features =[
    "Pclass",
    "Sex",
    "Age",
    "SibSp",
    "Parch",
    "Fare",
    "Embarked"
]

# Train classifier
gnb.fit(
    X_train[used_features].values,
    X_train["Survived"]
)
y_pred = gnb.predict(X_test[used_features])

# Print results
print("Number of mislabeled points out of a total {} points : {}, performance {:05.2f}%"
      .format(
          X_test.shape[0],
          (X_test["Survived"] != y_pred).sum(),
          100*(1-(X_test["Survived"] != y_pred).sum()/X_test.shape[0])
))

accuracy = accuracy_score(X_test["Survived"], y_pred)
num_correct_samples = accuracy_score(X_test["Survived"], y_pred, normalize=False)

print('number of correct sample: {}'.format(num_correct_samples))
print('accuracy: {}'.format(accuracy))



Number of mislabeled points out of a total 357 points : 81, performance 77.31%
number of correct sample: 276
accuracy: 0.773109243697479


# Random Forest

In [13]:
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn import neighbors, datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


# load csv file
df = pd.read_csv('./dataset/seeds_dataset.csv', header=None)

X = df[[0, 1, 2, 3, 4, 5, 6]]
y = df[7]-1

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)


model = RandomForestClassifier(max_depth=7, random_state=0)
model.fit(X_train, y_train)


X_test = scaler.transform(X_test)
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
num_correct_samples = accuracy_score(y_test, y_pred, normalize=False)

print('number of correct sample: {}'.format(num_correct_samples))
print('accuracy: {}'.format(accuracy))


number of correct sample: 41
accuracy: 0.9761904761904762


# Support Vector Machine

In [26]:
import pandas as pd
import numpy as np
from sklearn import preprocessing, linear_model
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.svm import SVC

pima = pd.read_csv('./dataset/pima-indians-diabetes.csv')

#用'pregnant','insulin','bmi', 'age' 三個變數預測'label'(是否發病)
df=pima[['pregnant', 'insulin', 'bmi', 'age', 'label']]


X=df[['pregnant', 'insulin', 'bmi', 'age']]
y=df['label']

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=1) #random_state 種子值

scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)

model = SVC(kernel='rbf')
model.fit(X_train, y_train) 

X_test = scaler.transform(X_test)
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
num_correct_samples = accuracy_score(y_test, y_pred, normalize=False)

print('number of correct sample: {}'.format(num_correct_samples))
print('accuracy: {}'.format(accuracy))


number of correct sample: 160
accuracy: 0.6926406926406926
