# kaggle14.py
import matplotlib
matplotlib.use("PS")
import modin.pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use("fivethirtyeight")
import warnings
warnings.filterwarnings("ignore")
data = pd.read_csv("train.csv")
data.head()
data.isnull().sum() # checking for total null values
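# Optional sketch: the same null check as a share of rows, which makes the
# scale of the Age/Cabin gaps easier to judge (not part of the original flow).
print((data.isnull().sum() / len(data) * 100).round(2))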
data.groupby(["Sex", "Survived"])["Survived"].count()
f, ax = plt.subplots(1, 2, figsize=(18, 8))
data[["Sex", "Survived"]].groupby(["Sex"]).mean().plot.bar(ax=ax[0])
ax[0].set_title("Survived vs Sex")
sns.countplot("Sex", hue="Survived", data=data, ax=ax[1])
ax[1].set_title("Sex:Survived vs Dead")
plt.show()
pd.crosstab(data.Pclass, data.Survived, margins=True).style.background_gradient(
    cmap="summer_r"
)
f, ax = plt.subplots(1, 2, figsize=(18, 8))
data["Pclass"].value_counts().plot.bar(
    color=["#CD7F32", "#FFDF00", "#D3D3D3"], ax=ax[0]
)
ax[0].set_title("Number Of Passengers By Pclass")
ax[0].set_ylabel("Count")
sns.countplot("Pclass", hue="Survived", data=data, ax=ax[1])
ax[1].set_title("Pclass:Survived vs Dead")
plt.show()
pd.crosstab(
    [data.Sex, data.Survived], data.Pclass, margins=True
).style.background_gradient(cmap="summer_r")
sns.factorplot("Pclass", "Survived", hue="Sex", data=data)
plt.show()
print("Oldest Passenger was of:", data["Age"].max(), "Years")
print("Youngest Passenger was of:", data["Age"].min(), "Years")
print("Average Age on the ship:", data["Age"].mean(), "Years")
f, ax = plt.subplots(1, 2, figsize=(18, 8))
sns.violinplot("Pclass", "Age", hue="Survived", data=data, split=True, ax=ax[0])
ax[0].set_title("Pclass and Age vs Survived")
ax[0].set_yticks(range(0, 110, 10))
sns.violinplot("Sex", "Age", hue="Survived", data=data, split=True, ax=ax[1])
ax[1].set_title("Sex and Age vs Survived")
ax[1].set_yticks(range(0, 110, 10))
plt.show()
data["Initial"] = 0
for i in data:
data["Initial"] = data.Name.str.extract(
r"([A-Za-z]+)\." # noqa: W605
) # lets extract the Salutations
pd.crosstab(data.Initial, data.Sex).T.style.background_gradient(
cmap="summer_r"
) # Checking the Initials with the Sex
data["Initial"].replace(
[
"Mlle",
"Mme",
"Ms",
"Dr",
"Major",
"Lady",
"Countess",
"Jonkheer",
"Col",
"Rev",
"Capt",
"Sir",
"Don",
],
[
"Miss",
"Miss",
"Miss",
"Mr",
"Mr",
"Mrs",
"Mrs",
"Other",
"Other",
"Other",
"Mr",
"Mr",
"Mr",
],
inplace=True,
)
data.groupby("Initial")["Age"].mean() # lets check the average age by Initials
# fill missing ages with the rounded mean age of each Initial group (from above)
data.loc[(data.Age.isnull()) & (data.Initial == "Mr"), "Age"] = 33
data.loc[(data.Age.isnull()) & (data.Initial == "Mrs"), "Age"] = 36
data.loc[(data.Age.isnull()) & (data.Initial == "Master"), "Age"] = 5
data.loc[(data.Age.isnull()) & (data.Initial == "Miss"), "Age"] = 22
data.loc[(data.Age.isnull()) & (data.Initial == "Other"), "Age"] = 46
data.Age.isnull().any()  # finally, no null values left
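# Equivalent one-liner sketch: impute Age from each Initial group's mean
# directly (a no-op at this point, since the ages were already filled above).
data["Age"] = data["Age"].fillna(data.groupby("Initial")["Age"].transform("mean"))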
f, ax = plt.subplots(1, 2, figsize=(20, 10))
data[data["Survived"] == 0].Age.plot.hist(
ax=ax[0], bins=20, edgecolor="black", color="red"
)
ax[0].set_title("Survived= 0")
x1 = list(range(0, 85, 5))
ax[0].set_xticks(x1)
data[data["Survived"] == 1].Age.plot.hist(
ax=ax[1], color="green", bins=20, edgecolor="black"
)
ax[1].set_title("Survived= 1")
x2 = list(range(0, 85, 5))
ax[1].set_xticks(x2)
plt.show()
sns.factorplot("Pclass", "Survived", col="Initial", data=data)
plt.show()
pd.crosstab(
    [data.Embarked, data.Pclass], [data.Sex, data.Survived], margins=True
).style.background_gradient(cmap="summer_r")
sns.factorplot("Embarked", "Survived", data=data)
fig = plt.gcf()
fig.set_size_inches(5, 3)
plt.show()
f, ax = plt.subplots(2, 2, figsize=(20, 15))
sns.countplot("Embarked", data=data, ax=ax[0, 0])
ax[0, 0].set_title("No. Of Passengers Boarded")
sns.countplot("Embarked", hue="Sex", data=data, ax=ax[0, 1])
ax[0, 1].set_title("Male-Female Split for Embarked")
sns.countplot("Embarked", hue="Survived", data=data, ax=ax[1, 0])
ax[1, 0].set_title("Embarked vs Survived")
sns.countplot("Embarked", hue="Pclass", data=data, ax=ax[1, 1])
ax[1, 1].set_title("Embarked vs Pclass")
plt.subplots_adjust(wspace=0.2, hspace=0.5)
plt.show()
sns.factorplot("Pclass", "Survived", hue="Sex", col="Embarked", data=data)
plt.show()
data["Embarked"].fillna("S", inplace=True)
data.Embarked.isnull().any() # Finally No NaN values
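# Equivalent sketch: fill with the most frequent port instead of hard-coding
# "S" (mode()[0] is "S" for this dataset, so the result is the same).
data["Embarked"].fillna(data["Embarked"].mode()[0], inplace=True)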
pd.crosstab([data.SibSp], data.Survived).style.background_gradient(cmap="summer_r")
f, ax = plt.subplots(1, 2, figsize=(20, 8))
sns.barplot("SibSp", "Survived", data=data, ax=ax[0])
ax[0].set_title("SibSp vs Survived")
sns.factorplot("SibSp", "Survived", data=data, ax=ax[1])
ax[1].set_title("SibSp vs Survived")
plt.close(2)
plt.show()
pd.crosstab(data.SibSp, data.Pclass).style.background_gradient(cmap="summer_r")
pd.crosstab(data.Parch, data.Pclass).style.background_gradient(cmap="summer_r")
f, ax = plt.subplots(1, 2, figsize=(20, 8))
sns.barplot("Parch", "Survived", data=data, ax=ax[0])
ax[0].set_title("Parch vs Survived")
sns.factorplot("Parch", "Survived", data=data, ax=ax[1])
ax[1].set_title("Parch vs Survived")
plt.close(2)
plt.show()
print("Highest Fare was:", data["Fare"].max())
print("Lowest Fare was:", data["Fare"].min())
print("Average Fare was:", data["Fare"].mean())
f, ax = plt.subplots(1, 3, figsize=(20, 8))
sns.distplot(data[data["Pclass"] == 1].Fare, ax=ax[0])
ax[0].set_title("Fares in Pclass 1")
sns.distplot(data[data["Pclass"] == 2].Fare, ax=ax[1])
ax[1].set_title("Fares in Pclass 2")
sns.distplot(data[data["Pclass"] == 3].Fare, ax=ax[2])
ax[2].set_title("Fares in Pclass 3")
plt.show()
sns.heatmap(
    data.corr(), annot=True, cmap="RdYlGn", linewidths=0.2
)  # data.corr() --> correlation matrix
fig = plt.gcf()
fig.set_size_inches(10, 8)
plt.show()
data["Age_band"] = 0
data.loc[data["Age"] <= 16, "Age_band"] = 0
data.loc[(data["Age"] > 16) & (data["Age"] <= 32), "Age_band"] = 1
data.loc[(data["Age"] > 32) & (data["Age"] <= 48), "Age_band"] = 2
data.loc[(data["Age"] > 48) & (data["Age"] <= 64), "Age_band"] = 3
data.loc[data["Age"] > 64, "Age_band"] = 4
data.head(2)
data["Age_band"].value_counts().to_frame().style.background_gradient(
cmap="summer"
) # checking the number of passenegers in each band
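# Alternative sketch: the same bands via pd.cut; `age_band_check` is a scratch
# name used only to illustrate the equivalent cut-based approach.
age_band_check = pd.cut(
    data["Age"], bins=[0, 16, 32, 48, 64, 81], labels=[0, 1, 2, 3, 4]
)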
sns.factorplot("Age_band", "Survived", data=data, col="Pclass")
plt.show()
data["Family_Size"] = 0
data["Family_Size"] = data["Parch"] + data["SibSp"] # family size
data["Alone"] = 0
data.loc[data.Family_Size == 0, "Alone"] = 1 # Alone
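# Equivalent one-liner sketch for the two steps above:
# data["Alone"] = (data["Family_Size"] == 0).astype(int)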
f, ax = plt.subplots(1, 2, figsize=(18, 6))
sns.factorplot("Family_Size", "Survived", data=data, ax=ax[0])
ax[0].set_title("Family_Size vs Survived")
sns.factorplot("Alone", "Survived", data=data, ax=ax[1])
ax[1].set_title("Alone vs Survived")
plt.close(2)
plt.close(3)
plt.show()
sns.factorplot("Alone", "Survived", data=data, hue="Sex", col="Pclass")
plt.show()
data["Fare_Range"] = pd.qcut(data["Fare"], 4)
data.groupby(["Fare_Range"])["Survived"].mean().to_frame().style.background_gradient(
cmap="summer_r"
)
data["Fare_cat"] = 0
data.loc[data["Fare"] <= 7.91, "Fare_cat"] = 0
data.loc[(data["Fare"] > 7.91) & (data["Fare"] <= 14.454), "Fare_cat"] = 1
data.loc[(data["Fare"] > 14.454) & (data["Fare"] <= 31), "Fare_cat"] = 2
data.loc[(data["Fare"] > 31) & (data["Fare"] <= 513), "Fare_cat"] = 3
sns.factorplot("Fare_cat", "Survived", data=data, hue="Sex")
plt.show()
data["Sex"].replace(["male", "female"], [0, 1], inplace=True)
data["Embarked"].replace(["S", "C", "Q"], [0, 1, 2], inplace=True)
data["Initial"].replace(
["Mr", "Mrs", "Miss", "Master", "Other"], [0, 1, 2, 3, 4], inplace=True
)
data.drop(
["Name", "Age", "Ticket", "Fare", "Cabin", "Fare_Range", "PassengerId"],
axis=1,
inplace=True,
)
sns.heatmap(
data.corr(), annot=True, cmap="RdYlGn", linewidths=0.2, annot_kws={"size": 20}
)
fig = plt.gcf()
fig.set_size_inches(18, 15)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.show()
from sklearn.linear_model import LogisticRegression # logistic regression
from sklearn import svm # support vector Machine
from sklearn.ensemble import RandomForestClassifier # Random Forest
from sklearn.neighbors import KNeighborsClassifier # KNN
from sklearn.naive_bayes import GaussianNB # Naive bayes
from sklearn.tree import DecisionTreeClassifier # Decision Tree
from sklearn.model_selection import train_test_split # training and testing data split
from sklearn import metrics # accuracy measure
from sklearn.metrics import confusion_matrix # for confusion matrix
train, test = train_test_split(
    data, test_size=0.3, random_state=0, stratify=data["Survived"]
)
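# Quick sanity check (sketch): stratifying should keep the survival rate
# nearly identical in both splits.
print(train["Survived"].mean(), test["Survived"].mean())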
train_X = train[train.columns[1:]]
train_Y = train[train.columns[:1]]
test_X = test[test.columns[1:]]
test_Y = test[test.columns[:1]]
X = data[data.columns[1:]]
Y = data["Survived"]
model = svm.SVC(kernel="rbf", C=1, gamma=0.1)
model.fit(train_X, train_Y)
prediction1 = model.predict(test_X)
print("Accuracy for rbf SVM is ", metrics.accuracy_score(prediction1, test_Y))
model = svm.SVC(kernel="linear", C=0.1, gamma=0.1)
model.fit(train_X, train_Y)
prediction2 = model.predict(test_X)
print("Accuracy for linear SVM is", metrics.accuracy_score(prediction2, test_Y))
model = LogisticRegression()
model.fit(train_X, train_Y)
prediction3 = model.predict(test_X)
print(
    "The accuracy of the Logistic Regression is",
    metrics.accuracy_score(prediction3, test_Y),
)
model = DecisionTreeClassifier()
model.fit(train_X, train_Y)
prediction4 = model.predict(test_X)
print(
    "The accuracy of the Decision Tree is", metrics.accuracy_score(prediction4, test_Y)
)
model = KNeighborsClassifier()
model.fit(train_X, train_Y)
prediction5 = model.predict(test_X)
print("The accuracy of the KNN is", metrics.accuracy_score(prediction5, test_Y))
a_index = list(range(1, 11))
x = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
accuracy_list = []
for i in a_index:
    model = KNeighborsClassifier(n_neighbors=i)
    model.fit(train_X, train_Y)
    prediction = model.predict(test_X)
    accuracy_list.append(metrics.accuracy_score(prediction, test_Y))
a = pd.Series(accuracy_list, index=a_index)  # avoids the removed pd.Series.append
plt.plot(a_index, a)
plt.xticks(x)
fig = plt.gcf()
fig.set_size_inches(12, 6)
plt.show()
print(
    "Accuracies for different values of n are:",
    a.values,
    "with the max value",
    a.values.max(),
)
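# Equivalent sketch with GridSearchCV over n_neighbors; cv=5 here is an
# assumption, not taken from the loop above (which used a single holdout).
from sklearn.model_selection import GridSearchCV
knn_gd = GridSearchCV(
    KNeighborsClassifier(), {"n_neighbors": list(range(1, 11))}, cv=5
)
knn_gd.fit(train_X, train_Y.values.ravel())
print("Best n_neighbors:", knn_gd.best_params_)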
model = GaussianNB()
model.fit(train_X, train_Y)
prediction6 = model.predict(test_X)
print("The accuracy of the NaiveBayes is", metrics.accuracy_score(prediction6, test_Y))
model = RandomForestClassifier(n_estimators=100)
model.fit(train_X, train_Y)
prediction7 = model.predict(test_X)
print(
    "The accuracy of the Random Forests is", metrics.accuracy_score(prediction7, test_Y)
)
from sklearn.model_selection import KFold # for K-fold cross validation
from sklearn.model_selection import cross_val_score # score evaluation
from sklearn.model_selection import cross_val_predict # prediction
kfold = KFold(
    n_splits=10, shuffle=True, random_state=22
)  # k=10; shuffle so that random_state actually takes effect
xyz = []
accuracy = []
std = []
classifiers = [
    "Linear Svm",
    "Radial Svm",
    "Logistic Regression",
    "KNN",
    "Decision Tree",
    "Naive Bayes",
    "Random Forest",
]
models = [
    svm.SVC(kernel="linear"),
    svm.SVC(kernel="rbf"),
    LogisticRegression(),
    KNeighborsClassifier(n_neighbors=9),
    DecisionTreeClassifier(),
    GaussianNB(),
    RandomForestClassifier(n_estimators=100),
]
for model in models:
    cv_result = cross_val_score(model, X, Y, cv=kfold, scoring="accuracy")
    xyz.append(cv_result.mean())
    std.append(cv_result.std())
    accuracy.append(cv_result)
new_models_dataframe2 = pd.DataFrame({"CV Mean": xyz, "Std": std}, index=classifiers)
new_models_dataframe2
plt.subplots(figsize=(12, 6))
box = pd.DataFrame(accuracy, index=classifiers)  # one row of CV scores per model
box.T.boxplot()
new_models_dataframe2["CV Mean"].plot.barh(width=0.8)
plt.title("Average CV Mean Accuracy")
fig = plt.gcf()
fig.set_size_inches(8, 5)
plt.show()
f, ax = plt.subplots(3, 3, figsize=(12, 10))
y_pred = cross_val_predict(svm.SVC(kernel="rbf"), X, Y, cv=10)
sns.heatmap(confusion_matrix(Y, y_pred), ax=ax[0, 0], annot=True, fmt="2.0f")
ax[0, 0].set_title("Matrix for rbf-SVM")
y_pred = cross_val_predict(svm.SVC(kernel="linear"), X, Y, cv=10)
sns.heatmap(confusion_matrix(Y, y_pred), ax=ax[0, 1], annot=True, fmt="2.0f")
ax[0, 1].set_title("Matrix for Linear-SVM")
y_pred = cross_val_predict(KNeighborsClassifier(n_neighbors=9), X, Y, cv=10)
sns.heatmap(confusion_matrix(Y, y_pred), ax=ax[0, 2], annot=True, fmt="2.0f")
ax[0, 2].set_title("Matrix for KNN")
y_pred = cross_val_predict(RandomForestClassifier(n_estimators=100), X, Y, cv=10)
sns.heatmap(confusion_matrix(Y, y_pred), ax=ax[1, 0], annot=True, fmt="2.0f")
ax[1, 0].set_title("Matrix for Random-Forests")
y_pred = cross_val_predict(LogisticRegression(), X, Y, cv=10)
sns.heatmap(confusion_matrix(Y, y_pred), ax=ax[1, 1], annot=True, fmt="2.0f")
ax[1, 1].set_title("Matrix for Logistic Regression")
y_pred = cross_val_predict(DecisionTreeClassifier(), X, Y, cv=10)
sns.heatmap(confusion_matrix(Y, y_pred), ax=ax[1, 2], annot=True, fmt="2.0f")
ax[1, 2].set_title("Matrix for Decision Tree")
y_pred = cross_val_predict(GaussianNB(), X, Y, cv=10)
sns.heatmap(confusion_matrix(Y, y_pred), ax=ax[2, 0], annot=True, fmt="2.0f")
ax[2, 0].set_title("Matrix for Naive Bayes")
plt.subplots_adjust(hspace=0.2, wspace=0.2)
plt.show()
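# Complementary sketch: a precision/recall/F1 text summary for one model (the
# rbf-SVM), using the same out-of-fold predictions as the matrices above.
from sklearn.metrics import classification_report
y_pred = cross_val_predict(svm.SVC(kernel="rbf"), X, Y, cv=10)
print(classification_report(Y, y_pred))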
from sklearn.model_selection import GridSearchCV
C = [0.05, 0.1, 0.2, 0.25, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
gamma = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
kernel = ["rbf", "linear"]
hyper = {"kernel": kernel, "C": C, "gamma": gamma}
gd = GridSearchCV(estimator=svm.SVC(), param_grid=hyper, verbose=True)
gd.fit(X, Y)
print(gd.best_score_)
print(gd.best_estimator_)
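# Usage sketch: rebuild the tuned SVM from the search result rather than
# retyping the values (the keys match the `hyper` grid above).
best_svm = svm.SVC(**gd.best_params_).fit(train_X, train_Y.values.ravel())
print("Tuned SVM holdout accuracy:", best_svm.score(test_X, test_Y.values.ravel()))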
n_estimators = range(100, 1000, 100)
hyper = {"n_estimators": n_estimators}
gd = GridSearchCV(
    estimator=RandomForestClassifier(random_state=0), param_grid=hyper, verbose=True
)
gd.fit(X, Y)
print(gd.best_score_)
print(gd.best_estimator_)
from sklearn.ensemble import VotingClassifier
ensemble_lin_rbf = VotingClassifier(
    estimators=[
        ("KNN", KNeighborsClassifier(n_neighbors=10)),
        ("RBF", svm.SVC(probability=True, kernel="rbf", C=0.5, gamma=0.1)),
        ("RFor", RandomForestClassifier(n_estimators=500, random_state=0)),
        ("LR", LogisticRegression(C=0.05)),
        ("DT", DecisionTreeClassifier(random_state=0)),
        ("NB", GaussianNB()),
        ("svm", svm.SVC(kernel="linear", probability=True)),
    ],
    voting="soft",
).fit(train_X, train_Y)
print("The accuracy for ensembled model is:", ensemble_lin_rbf.score(test_X, test_Y))
cross = cross_val_score(ensemble_lin_rbf, X, Y, cv=10, scoring="accuracy")
print("The cross validated score is", cross.mean())
from sklearn.ensemble import BaggingClassifier
model = BaggingClassifier(
    base_estimator=KNeighborsClassifier(n_neighbors=3), random_state=0, n_estimators=700
)
model.fit(train_X, train_Y)
prediction = model.predict(test_X)
print("The accuracy for bagged KNN is:", metrics.accuracy_score(prediction, test_Y))
result = cross_val_score(model, X, Y, cv=10, scoring="accuracy")
print("The cross validated score for bagged KNN is:", result.mean())
model = BaggingClassifier(
    base_estimator=DecisionTreeClassifier(), random_state=0, n_estimators=100
)
model.fit(train_X, train_Y)
prediction = model.predict(test_X)
print(
    "The accuracy for bagged Decision Tree is:",
    metrics.accuracy_score(prediction, test_Y),
)
result = cross_val_score(model, X, Y, cv=10, scoring="accuracy")
print("The cross validated score for bagged Decision Tree is:", result.mean())
from sklearn.ensemble import AdaBoostClassifier
ada = AdaBoostClassifier(n_estimators=200, random_state=0, learning_rate=0.1)
result = cross_val_score(ada, X, Y, cv=10, scoring="accuracy")
print("The cross validated score for AdaBoost is:", result.mean())
from sklearn.ensemble import GradientBoostingClassifier
grad = GradientBoostingClassifier(n_estimators=500, random_state=0, learning_rate=0.1)
result = cross_val_score(grad, X, Y, cv=10, scoring="accuracy")
print("The cross validated score for Gradient Boosting is:", result.mean())
import xgboost as xg
xgboost = xg.XGBClassifier(n_estimators=900, learning_rate=0.1)
result = cross_val_score(xgboost, X, Y, cv=10, scoring="accuracy")
print("The cross validated score for XGBoost is:", result.mean())
n_estimators = list(range(100, 1100, 100))
learn_rate = [0.05, 0.1, 0.2, 0.25, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
hyper = {"n_estimators": n_estimators, "learning_rate": learn_rate}
gd = GridSearchCV(estimator=AdaBoostClassifier(), param_grid=hyper, verbose=True)
gd.fit(X, Y)
print(gd.best_score_)
print(gd.best_estimator_)
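# Usage sketch: build the tuned model straight from the search result instead
# of retyping values (the hard-coded 200/0.05 below assume that outcome).
ada_best = AdaBoostClassifier(**gd.best_params_, random_state=0)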
ada = AdaBoostClassifier(n_estimators=200, random_state=0, learning_rate=0.05)
result = cross_val_predict(ada, X, Y, cv=10)
sns.heatmap(confusion_matrix(Y, result), cmap="winter", annot=True, fmt="2.0f")
plt.show()
f, ax = plt.subplots(2, 2, figsize=(15, 12))
model = RandomForestClassifier(n_estimators=500, random_state=0)
model.fit(X, Y)
pd.Series(model.feature_importances_, X.columns).sort_values(ascending=True).plot.barh(
    width=0.8, ax=ax[0, 0]
)
ax[0, 0].set_title("Feature Importance in Random Forests")
model = AdaBoostClassifier(n_estimators=200, learning_rate=0.05, random_state=0)
model.fit(X, Y)
pd.Series(model.feature_importances_, X.columns).sort_values(ascending=True).plot.barh(
    width=0.8, ax=ax[0, 1], color="#ddff11"
)
ax[0, 1].set_title("Feature Importance in AdaBoost")
model = GradientBoostingClassifier(n_estimators=500, learning_rate=0.1, random_state=0)
model.fit(X, Y)
pd.Series(model.feature_importances_, X.columns).sort_values(ascending=True).plot.barh(
    width=0.8, ax=ax[1, 0], cmap="RdYlGn_r"
)
ax[1, 0].set_title("Feature Importance in Gradient Boosting")
model = xg.XGBClassifier(n_estimators=900, learning_rate=0.1)
model.fit(X, Y)
pd.Series(model.feature_importances_, X.columns).sort_values(ascending=True).plot.barh(
    width=0.8, ax=ax[1, 1], color="#FD0F00"
)
ax[1, 1].set_title("Feature Importance in XgBoost")
ax[1, 1].set_title("Feature Importance in XgBoost")
plt.show()
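# Model-agnostic cross-check (sketch): permutation importance on the holdout
# split, as a sanity check on the impurity-based importances plotted above.
# Assumes sklearn >= 0.22 for sklearn.inspection.permutation_importance.
from sklearn.inspection import permutation_importance
rf = RandomForestClassifier(n_estimators=500, random_state=0)
rf.fit(train_X, train_Y.values.ravel())
perm = permutation_importance(
    rf, test_X, test_Y.values.ravel(), n_repeats=10, random_state=0
)
print(pd.Series(perm.importances_mean, index=test_X.columns).sort_values(ascending=False))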