In [65]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.cross_validation import KFold
from sklearn.cross_validation import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import scale
import matplotlib.pyplot as plt
from sklearn.datasets import load_boston
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import Perceptron
from sklearn.preprocessing import StandardScaler

In [None]:
data = pd.read_csv("C:/Users/U_M0SLV/Downloads/titanic.csv")
data.dtypes

In [None]:
#Amount of male and female
data.groupby("Sex").agg({"PassengerId": lambda x: x.count()}).to_dict()['PassengerId']

In [None]:
#Survival rate
tmp=data.groupby("Survived").agg({"PassengerId": lambda x: x.count()}).to_dict()['PassengerId']
1.0*tmp[1]/(tmp[0]+tmp[1])

In [None]:
#1st class passengers' rate
tmp=data.groupby("Pclass").agg({"PassengerId": lambda x: x.count()}).to_dict()['PassengerId']
1.0*tmp[1]/data["PassengerId"].size

In [None]:
#mean and median
print "mean age: "+ str(data["Age"].mean())
print "mean age: "+ str(data["Age"].median())

In [None]:
#Pearson correlation
data.corr()['SibSp']['Parch']

In [None]:
#most popular name
data[data["Sex"] == 'female']["Name"].str.extract('(Miss\. |Mrs\.[A-Za-z ]*\()([A-Za-z]*)')[1].value_counts()[:1]

In [None]:
#decision tree classifier
clf = DecisionTreeClassifier(random_state=241)
tmp=data[["Pclass", "Fare", "Age", "Survived","Sex"]]
tmp=tmp.dropna()
#print tmp.shape
tmp = tmp.replace({"Sex": {"female": 0, "male": 1}})
tmp["Sex"]=tmp["Sex"].astype("category", categories=[0,1], ordered=False)
clf.fit(tmp[["Pclass", "Fare", "Age", "Sex"]],tmp["Survived"])
print clf.feature_importances_

# KNN and cross-validation

In [81]:
data = np.genfromtxt("C:/Users/U_M0SLV/Downloads/wine.data.txt", delimiter=',')
Y = data[:,0]
X = data[:,1:]

In [None]:
#perform cross-validation
kfold = KFold(X.size, n_folds=5, shuffle=True, random_state=42)
knn = KNeighborsClassifier(n_neighbors=1)


In [21]:
#perform cross-validation
kfl = KFold(Y.size, n_folds=5, shuffle=True, random_state=42)
scores = []
k_range = range(1,50)
for i in k_range:
    knn1 = KNeighborsClassifier(n_neighbors=i)
    scores.append(cross_val_score(knn1, X, Y, cv=kfl, scoring='accuracy').mean())

In [38]:
plt.plot(k_range, scores)
plt.xlabel('Value of K for KNN')
plt.ylabel('Cross-Validated Accuracy')
plt.show()

In [33]:
print "Minimum KNN error for k=" + str(scores.index(max(scores))+1) + " with error " + str(max(scores))

Minimum KNN error for k=1 with error 0.730476190476


In [36]:
X_norm = scale(X, axis=0)

In [37]:
#perform cross-validation
kfl = KFold(Y.size, n_folds=5, shuffle=True, random_state=42)
scores = []
k_range = range(1,50)
for i in k_range:
    knn1 = KNeighborsClassifier(n_neighbors=i)
    scores.append(cross_val_score(knn1, X_norm, Y, cv=kfl, scoring='accuracy').mean())

In [39]:
print "The best KNN error with scaling for k=" + str(scores.index(max(scores))+1) + " with error " + str(max(scores))

The best KNN error with scaling for k=29 with error 0.977619047619


# Regression based on k-nearest neighbors

In [47]:
data = load_boston()
Y=data.target
X=data.data
X_norm = scale(X, axis=0)

In [56]:
kfl = KFold(Y.size, n_folds=5, shuffle=True, random_state=42)
p_range = np.linspace(1,10,50)
scores = []
for i in p_range:
    knr = KNeighborsRegressor(n_neighbors=5, weights='distance', p=i)
    scores.append(cross_val_score(knr, X_norm, Y, cv=kfl, scoring='mean_squared_error').mean())

In [63]:
plt.plot(p_range, scores)
plt.xlabel('Value of p for KNR')
plt.ylabel('Cross-Validated MSE')
plt.show()

In [64]:
print "Minimum MSE for p=" + str(p_range[scores.index(max(scores))+1]) + " with error " + str(max(scores))

Minimum MSE for p=1.18367346939 with error -16.0502085084


# Нормализация признаков и перспетрон

In [90]:
data_test = np.genfromtxt("C:/Users/U_M0SLV/Downloads/perceptron-test.csv", delimiter=',')
data_train = np.genfromtxt("C:/Users/U_M0SLV/Downloads/perceptron-train.csv", delimiter=',')

In [92]:
Y_train = data_train[:,0]
X_train = data_train[:,1:]
Y_test = data_test[:,0]
X_test = data_test[:,1:]

In [101]:
clf = Perceptron(random_state=241)
clf.fit(X_train, Y_train)
clf.score(X_test, Y_test)

0.35999999999999999

In [102]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
clf1 = Perceptron(random_state=241)
clf1.fit(X_train_scaled, Y_train)
clf1.score(X_test_scaled, Y_test)

0.92500000000000004

In [103]:
print "Difference between accuracy on non-scaled and scaled features: " + str(clf1.score(X_test_scaled, Y_test)-clf.score(X_test, Y_test))

Difference between accuracy on non-scaled and scaled features: 0.565
