In [51]:
import csv
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
import sklearn.metrics as skm
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
import graphviz

# with open('train.csv') as csvData:
#     temp=list(csv.DictReader(csvData))
# train=pd.DataFrame(temp)

train=pd.read_csv("train.csv")
test=pd.read_csv("test.csv")
test.head()
train.head()

Unnamed: 0,Id,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,...,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type
0,1,2596,51,3,258,0,510,221,232,148,...,0,0,0,0,0,0,0,0,0,5
1,2,2590,56,2,212,-6,390,220,235,151,...,0,0,0,0,0,0,0,0,0,5
2,3,2804,139,9,268,65,3180,234,238,135,...,0,0,0,0,0,0,0,0,0,2
3,4,2785,155,18,242,118,3090,238,238,122,...,0,0,0,0,0,0,0,0,0,2
4,5,2595,45,2,153,-1,391,220,234,150,...,0,0,0,0,0,0,0,0,0,5


In [32]:
X_train=train.drop(["Cover_Type","Id"], axis=1)
Y_train=train["Cover_Type"]
X_test=test.drop("Id", axis=1)

LR=LogisticRegression(random_state=100,multi_class='ovr')
LR.fit(X_train,Y_train)
Y_train_predicted=LR.predict(X_train)

In [33]:
accuracy_score=skm.accuracy_score(Y_train,Y_train_predicted)
print("Accuracy Score",accuracy_score)

f1_score=skm.f1_score(Y_train,Y_train_predicted,average='weighted')
print("F1 Score      ",f1_score)

Accuracy Score 0.675132275132
F1 Score       0.670514477523


In [34]:
X_test=test.drop(["Id"],axis=1)
Y_test_predicted=LR.predict(X_test)

In [35]:
np.unique(Y_test_predicted)

array([1, 2, 3, 4, 5, 6, 7])

In [38]:
skm.confusion_matrix(Y_train,Y_train_predicted)

array([[1357,  355,    6,    0,  155,   18,  269],
       [ 472, 1044,   64,    0,  469,   85,   26],
       [   0,    8, 1151,  309,   92,  598,    2],
       [   0,    0,  148, 1909,    0,  103,    0],
       [  48,  227,  183,    0, 1595,  103,    4],
       [   2,   56,  446,  205,  215, 1235,    1],
       [ 211,   16,    7,    0,    9,    0, 1917]])

In [21]:
m=zip(test["Id"],Y_test_predicted)
submission=pd.DataFrame(data=list(m),columns=["Id","Cover_Type"])
submission.to_csv("submission.csv",index=False)

### Cross Validation

#### Basic Logistic

In [22]:
from sklearn.model_selection import KFold,cross_val_score

In [23]:
local_k_fold=KFold(n_splits=5,shuffle=True,random_state=0)

results=cross_val_score(LogisticRegression(),X_train,Y_train,cv=5,scoring="accuracy")

In [24]:
print(results)
print(results.mean())

[ 0.54662698  0.56746032  0.62830688  0.63723545  0.67526455]
0.610978835979


### Multinomial

#### Multinomial is not better than  One Vs Rest
### Increasing the number Iteration doesn't improve the accuracy 

In [43]:
#results=cross_val_score(LogisticRegression(solver='sag'),X_train,Y_train,cv=5,scoring="accuracy")
#results=cross_val_score(LogisticRegression(solver='liblinear',max_iter=1000),X_train,Y_train,cv=5,scoring="accuracy")
#results=cross_val_score(LogisticRegression(solver='lbfgs',max_iter=1000),X_train,Y_train,cv=5,scoring="accuracy")
#results=cross_val_score(LogisticRegression(multi_class="multinomial",solver='sag'),X_train,Y_train,cv=5,scoring="accuracy")
#results=cross_val_score(LogisticRegression(multi_class="multinomial",solver='lbfgs'),X_train,Y_train,cv=5,scoring="accuracy")

print(results)
print(results.mean())

[ 0.49867725  0.42857143  0.40839947  0.43353175  0.47585979]
0.449007936508


### Standardization/Logit/Pipeline

In [48]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

standardizer = StandardScaler()
logit = LogisticRegression(random_state=100,max_iter=1000)
pipeline = make_pipeline(standardizer,logit)

results=cross_val_score(pipeline,X_train,Y_train, cv=5, n_jobs=2, scoring="accuracy")
results

array([ 0.56812169,  0.56051587,  0.64252646,  0.65376984,  0.6792328 ])

In [49]:
results.mean()

0.62083333333333335

### Support Vector Machine

In [52]:
standardizer = StandardScaler()
svm = SVC(kernel='rbf')
# [ 0.61111111  0.6167328   0.66898148  0.69246032  0.74041005]
# 0.66593915343915344
# svm = SVC(kernel='linear')
# 0.563161375661 0.736111111111 0.657142857143
# svm = SVC(kernel='poly')
# 0.597883597884 0.714947089947 0.644113756614
# svm = SVC(kernel='sigmoid')
# 0.597883597884 0.714947089947 0.644113756614
pipeline = make_pipeline(standardizer,svm)

In [53]:
results=cross_val_score(pipeline,X_train,Y_train, cv=5, n_jobs=2, scoring="accuracy")
print(results.min(),results.max(),results.mean())

0.611111111111 0.74041005291 0.665939153439


In [None]:
pipeline.fit(X_train,Y_train)
Y_train_predicted=pipeline.predict(X_train)

In [None]:
svm = SVC(kernel='rbf')
svm.fit(X_train,Y_train)
Y_test_predicted=pd.Series([0]*565892)
#Y_test_predicted = svm.predict(X_test.iloc[0:100000])
for i in range(1,565893,24604):
#for i in range(1,101,5):
    print(i)
    Y_test_predicted.iloc[(i-1):i+24603]=pd.Series(svm.predict(X_test.iloc[(i-1):i+24603]))
#len(Y_test_predicted)
#Y_test_predicted
Y_test_predicted.value_counts()

In [None]:
svm = SVC(kernel='rbf')
svm.fit(X_train,Y_train)
Y_test_predicted=svm.predict(X_test)
Y_test_predicted.value_counts()

In [None]:
confusion_matrix(Y_train,Y_train_predicted)

In [None]:
for i in range(1,21,4):
    print(range((i-1),i+3))

In [None]:
Y_test_predicted=pd.Series()

In [None]:
svm = SVC(kernel='rbf')
svm.fit(X_train,Y_train)
Y_test_predicted=pd.Series([0]*565892)
#Y_test_predicted = svm.predict(X_test.iloc[0:100000])
for i in range(1,565893,24604):
#for i in range(1,101,5):
    Y_test_predicted.iloc[(i-1):i+24603]=pd.Series(svm.predict(X_test.iloc[(i-1):i+24603]))
len(Y_test_predicted)
Y_test_predicted

# Decision Trees

In [76]:
import numpy as np
import pandas as pd
import sklearn.metrics as skm
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeClassifier,export_graphviz
from sklearn.model_selection import cross_val_score


train=pd.read_csv("train.csv")
test=pd.read_csv("test.csv")

X_train=train.drop(["Cover_Type","Id"], axis=1)
Y_train=train["Cover_Type"]
X_test=test.drop("Id", axis=1)

dtc=DecisionTreeClassifier()
dtc.fit(X_train,Y_train)
Y_train_pred=dtc.predict(X_train)

skm.confusion_matrix(Y_train,Y_train_pred)

array([[2160,    0,    0,    0,    0,    0,    0],
       [   0, 2160,    0,    0,    0,    0,    0],
       [   0,    0, 2160,    0,    0,    0,    0],
       [   0,    0,    0, 2160,    0,    0,    0],
       [   0,    0,    0,    0, 2160,    0,    0],
       [   0,    0,    0,    0,    0, 2160,    0],
       [   0,    0,    0,    0,    0,    0, 2160]])

In [77]:
results=cross_val_score(DecisionTreeClassifier(),X_train,Y_train,cv=5)
print(results)
results.mean()

[ 0.69246032  0.69742063  0.6875      0.67956349  0.77149471]


0.7056878306878307

In [78]:
skm.accuracy_score(Y_train,Y_train_pred)

1.0

# Random Forrest

In [80]:
import numpy as np
import pandas as pd
import sklearn.metrics as skm
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score


train=pd.read_csv("train.csv")
test=pd.read_csv("test.csv")

X_train=train.drop(["Cover_Type","Id"], axis=1)
Y_train=train["Cover_Type"]
X_test=test.drop("Id", axis=1)

rfc=RandomForestClassifier(n_estimators=10,n_jobs=3)
rfc.fit(X_train,Y_train)
Y_train_pred=rfc.predict(X_train)

skm.confusion_matrix(Y_train,Y_train_pred)

array([[2148,    7,    0,    0,    0,    0,    5],
       [  20, 2131,    2,    0,    3,    4,    0],
       [   0,    2, 2148,    2,    0,    8,    0],
       [   0,    0,    3, 2157,    0,    0,    0],
       [   0,    3,    0,    0, 2155,    2,    0],
       [   0,    0,   13,    9,    1, 2137,    0],
       [   0,    0,    0,    0,    0,    0, 2160]])

In [81]:
skm.accuracy_score(Y_train,Y_train_pred)

0.99444444444444446

In [89]:
rfc=RandomForestClassifier(n_estimators=60,n_jobs=3)
results=cross_val_score(rfc,X_train,Y_train,cv=5)
print(results)
results.mean()

[ 0.75595238  0.74239418  0.76554233  0.79794974  0.84623016]


0.78161375661375654