# Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns

from sklearn import preprocessing
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn import metrics

In [3]:
# Load the training set; the path is relative to the notebook's working directory.
train = pd.read_csv(filepath_or_buffer='../Data/train.csv')

# Analysing Dataset

In [None]:
# Preview the first five rows (head's default row count, written out).
train.head(n=5)

In [None]:
# Print the frame's dimensions, per-column dtypes and column labels, in that order.
for summary in (train.shape, train.dtypes, train.columns):
    print(summary)

In [None]:
# Summary statistics, with describe()'s default quartiles written out explicitly.
train.describe(percentiles=[0.25, 0.5, 0.75])

## Finding correlations

In [None]:
# Pairwise correlation heatmap of the dataset.
# Only numeric/bool columns are correlated: recent pandas releases raise on
# non-numeric columns instead of silently skipping them.
# NOTE(review): assumes the ID columns may be non-numeric — confirm with train.dtypes.
corr = train.select_dtypes(include=['number', 'bool']).corr()
fig, ax = plt.subplots(figsize=(10, 10))
# Draw on the axes created above rather than relying on the implicit "current axes".
sns.heatmap(corr, cmap='PiYG',
            xticklabels=corr.columns.values,
            yticklabels=corr.columns.values,
            ax=ax)

In [None]:
# Annotated heatmap of the (up to) 25 columns with the largest Pearson
# correlation to the target, bestSoldierPerc.
# Only numeric/bool columns are correlated: recent pandas releases raise on
# non-numeric columns instead of silently skipping them.
correlation = train.select_dtypes(include=['number', 'bool']).corr(method='pearson')
columns = correlation.nlargest(25, 'bestSoldierPerc').index
fig, ax = plt.subplots(figsize=(15, 15))
# Reuse the matrix computed above instead of recomputing it with np.corrcoef:
# same Pearson coefficients, no second pass over the data, and missing values
# are handled pairwise rather than turning whole rows/columns into NaN.
correlation_map = correlation.loc[columns, columns].values
sns.set(font_scale=1.0)
heatmap = sns.heatmap(correlation_map, cbar=True, annot=True, square=True, fmt='.2f',
                      yticklabels=columns.values, xticklabels=columns.values, ax=ax)

plt.show()

## Dropping columns

In [None]:
# Columns removed before modelling (the original list named 'killPoints' twice).
# NOTE: this mutates `train` in place, so running the cell a second time raises
# KeyError because the columns are already gone.
columns_to_drop = [
    'shipId', 'attackId', 'swimmingDistance', 'killPoints', 'killingStreaks',
    'friendlyKills', 'horseRideKills', 'numShips', 'castleTowerDestroys',
]
train.drop(columns_to_drop, axis=1, inplace=True)

## Checking for NaNs

In [None]:
# Count the missing values in each column.
missing_per_column = train.isna().sum()
missing_per_column

## Creating the model

In [None]:
# Target vector: the last column of `train`
# (expected to be bestSoldierPerc — confirm the column order).
Y = train.iloc[:, -1]

In [None]:
# Feature matrix: every column except the last one
# (the last column is expected to be bestSoldierPerc).
X = train.iloc[:, :train.shape[1] - 1]

In [None]:
# Hold out 20% of the rows so the fitted model can be scored on unseen data.
# This split was commented out, which left X_train/X_test/y_train/y_test
# undefined for the fit/score cell that follows.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)

In [None]:
# Keep a handle on the current column labels of `train`.
names = train.columns

In [None]:
# `linear_model` was never imported by the notebook's import cell, so bring it
# into scope here before building the ordinary least-squares estimator.
from sklearn import linear_model

lm = linear_model.LinearRegression()

In [None]:
# Fit on the training split; `model` is whatever lm.fit returns.
model = lm.fit(X=X_train, y=y_train)
# Score the fitted model on the held-out split (shown as the cell output).
model.score(X=X_test, y=y_test)

In [None]:
# Rebuild target and features from the full frame for cross-validation.
# Target: the last column (expected to be bestSoldierPerc).
Y = train.iloc[:, -1]
# Features: every column except the last one.
X = train.iloc[:, :train.shape[1] - 1]

In [None]:
# Fit on the full data, then evaluate with 6-fold cross-validation.
model = lm.fit(X=X, y=Y)
# Per-fold scores using the estimator's default scorer.
scores = cross_val_score(estimator=model, X=X, y=Y, cv=6)
# Out-of-fold prediction for every row, used for the error metric afterwards.
predictions = cross_val_predict(estimator=model, X=X, y=Y, cv=6)

In [None]:
# Mean absolute error between the true target and the cross-validated predictions.
print("Mean Absolute error")
metrics.mean_absolute_error(y_true=Y, y_pred=predictions)

# Trial: comparing two feature subsets (X vs. X1)

In [None]:
# Target vector, selected by column name.
Y = train.loc[:, 'bestSoldierPerc']

In [None]:
# Feature set X: drop the target, the ID columns and two kill-related columns.
# errors='ignore' lets this cell run whether or not the earlier in-place
# "Dropping columns" cell has already removed some of these labels
# (otherwise drop() raises KeyError for the missing ones).
X = train.drop(
    ['bestSoldierPerc', 'attackId', 'shipId', 'soldierId', 'friendlyKills', 'killingStreaks'],
    axis=1,
    errors='ignore',
)

In [None]:
# Feature set X1: drop only the target and the ID columns.
# errors='ignore' lets this cell run whether or not the earlier in-place
# "Dropping columns" cell has already removed some of these labels
# (otherwise drop() raises KeyError for the missing ones).
X1 = train.drop(
    ['bestSoldierPerc', 'attackId', 'shipId', 'soldierId'],
    axis=1,
    errors='ignore',
)

#### Model for X

In [None]:
# Fit on feature set X, then collect 6-fold out-of-fold predictions.
# (Per-fold scores are not computed for this feature set.)
model = lm.fit(X=X, y=Y)
predictions = cross_val_predict(estimator=model, X=X, y=Y, cv=6)

In [None]:
# Mean absolute error of the cross-validated predictions for feature set X.
print("Mean Absolute error")
metrics.mean_absolute_error(y_true=Y, y_pred=predictions)

#### Model for X1

In [None]:
# Fit on feature set X1, then evaluate it with 10-fold cross-validation.
model = lm.fit(X1, Y)
# The per-fold scores were previously computed on X (the other feature set)
# with 6 folds; score X1 with the same 10 folds used for the predictions so
# both numbers describe the same model and the same splits.
scores = cross_val_score(model, X1, Y, cv=10)
predictions1 = cross_val_predict(model, X1, Y, cv=10)

In [None]:
# Mean absolute error of the cross-validated predictions for feature set X1.
print("Mean Absolute error")
metrics.mean_absolute_error(y_true=Y, y_pred=predictions1)

# Finding the best regressor

In [4]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import KFold

In [5]:
# Target vector for the regressor comparison, selected by column name.
Y = train.loc[:, 'bestSoldierPerc']

In [6]:
# Features for the regressor comparison: drop the target, the ID columns,
# friendlyKills and killRank.
# errors='ignore' lets this cell run whether or not the earlier in-place
# "Dropping columns" cell has already removed some of these labels
# (otherwise drop() raises KeyError for the missing ones).
X = train.drop(
    ['shipId', 'attackId', 'bestSoldierPerc', 'soldierId', 'friendlyKills', 'killRank'],
    axis=1,
    errors='ignore',
)

In [7]:
# Seed the split so the cross-validation numbers can be reproduced between
# runs (21 matches the seed used for the estimators/splitters in this section).
# NOTE(review): test_size=0.7 leaves only 30% of the rows for training —
# confirm this is intentional (e.g. to keep model fitting fast).
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.7, random_state=21)

In [8]:
# Candidate regressors, each wrapped in a Pipeline with a StandardScaler so the
# scaler is fitted together with the model on each cross-validation fold.
# Uncomment a line to include that model in the comparison.
pipelines = []
# pipelines.append(('ScaledLR', Pipeline([('Scaler', StandardScaler()),('LR',LinearRegression())])))
# pipelines.append(('ScaledLASSO', Pipeline([('Scaler', StandardScaler()),('LASSO', Lasso())])))
# pipelines.append(('ScaledEN', Pipeline([('Scaler', StandardScaler()),('EN', ElasticNet())])))
# pipelines.append(('ScaledKNN', Pipeline([('Scaler', StandardScaler()),('KNN', KNeighborsRegressor())])))
# pipelines.append(('ScaledCART', Pipeline([('Scaler', StandardScaler()),('CART', DecisionTreeRegressor())])))
pipelines.append(('ScaledGBM', Pipeline([('Scaler', StandardScaler()),('GBM', GradientBoostingRegressor())])))

# One splitter shared by all candidates. random_state only has an effect when
# shuffle=True; current scikit-learn raises ValueError for a seeded,
# non-shuffling KFold, so shuffling is enabled explicitly.
kfold = KFold(n_splits=5, shuffle=True, random_state=21)

results = []
names = []
for name, model in pipelines:
    # Scores are negated mean absolute errors: closer to zero is better.
    cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring='neg_mean_absolute_error')
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

ScaledGBM: -0.098865 (0.000104)


In [None]:
# Grid search over the number of boosting stages for GradientBoostingRegressor.
from sklearn.model_selection import GridSearchCV

# Standardise the training features once, up front.
# NOTE(review): the scaler is fitted on all of X_train before the folds are
# made, so each validation fold has influenced the scaling statistics.
scaler = StandardScaler().fit(X_train)
rescaledX = scaler.transform(X_train)
param_grid = dict(n_estimators=np.array([50,100,200,300,400]))
model = GradientBoostingRegressor(random_state=21)
# random_state only has an effect when shuffle=True; current scikit-learn
# raises ValueError for a seeded, non-shuffling KFold, so shuffling is enabled.
kfold = KFold(n_splits=5, shuffle=True, random_state=21)
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring='neg_mean_absolute_error', cv=kfold)
grid_result = grid.fit(rescaledX, y_train)

# Report mean and spread of the negated MAE for every candidate setting
# (closer to zero is better), followed by the best one.
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))