# Logistic Regression 

Logistic regression is a statistical method for analyzing datasets in which one or more independent variables determine an outcome. The outcome is measured with a dichotomous variable (one with only two possible values). It is used to predict the probability of a binary event occurring, such as pass/fail, win/lose, or alive/dead.

**Importing the Libraries**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation notices from the plotting/ML libraries.
warnings.filterwarnings("ignore")
# Load seaborn's example Titanic dataset into the working DataFrame `df`.
df = sns.load_dataset("titanic")

## Perform EDA 
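Here are some of the features of the dataset (the seaborn copy used in this notebook stores the column names in lowercase, e.g. `survived`, `pclass`, `age`):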

* `Survived`: A binary variable indicating whether the passenger survived (1 = Yes, 0 = No).
* `Pclass`: Passenger class (1 = 1st, 2 = 2nd, 3 = 3rd).
* `Sex`: Gender of the passenger.
* `Age`: Age of the passenger.
* `SibSp`: Number of siblings/spouses aboard.
* `Parch`: Number of parents/children aboard.
* `Fare`: Passenger fare.
* `Embarked`: Port of embarkation (C = Cherbourg, Q = Queenstown, S = Southampton)



In [None]:
# Summarise column dtypes and non-null counts to see where data is missing.
df.info()

In [None]:
# Preview the first rows of the freshly loaded dataset.
df.head()


Let's take a look at a few features.

In [None]:
# Distribution of passenger age: density-scaled histogram with a KDE overlay.
# `sns.distplot` is deprecated, so the equivalent `histplot` call is used instead.
plt.figure(figsize=(10, 6))
sns.histplot(df["age"], kde=True, stat="density", color="darkred")

In [None]:
# Distribution of ticket fare: density-scaled histogram with a KDE overlay.
# `sns.distplot` is deprecated, so the equivalent `histplot` call is used instead.
plt.figure(figsize=(10, 6))
sns.histplot(df["fare"], kde=True, stat="density", color="darkred")

In [None]:
# Histogram of the categorical `who` column.
plt.figure(figsize=(10, 6))
sns.histplot(data=df, x="who", color="darkred")

Let's take a look at the children.

In [None]:
# List the distinct labels found in the `sex` and `who` columns.
print(df["sex"].unique())
print(df["who"].unique())

In [None]:
# Split the passengers labelled as children by sex and report, for each group,
# how many values are missing in every column.
children = df[df['who'] == 'child']
male_children = children[children['sex'] == "male"]
female_children = children[children['sex'] == "female"]
print("Male Children: \n")
print(male_children.isna().sum())
print("\nFemale Children: \n")
# The original printed this header without the counts that belong under it.
print(female_children.isna().sum())


In [None]:
# Missing-value count per column for the female-children subset.
print(female_children.isna().sum())

In [None]:
# Side-by-side age histograms (with KDE) for male and female children.
fig, (ax_boys, ax_girls) = plt.subplots(1, 2, figsize=(12, 6))

sns.histplot(male_children['age'], kde=True, color='blue', ax=ax_boys)
ax_boys.set_title('Age Distribution of Male Children')

sns.histplot(female_children['age'], kde=True, color='pink', ax=ax_girls)
ax_girls.set_title('Age Distribution of Female Children')

fig.tight_layout()
plt.show()


In [None]:
# Re-check the non-null counts per column before handling missing values.
df.info()

Let's clean up the `deck` column, as it has over 600 missing data points.

In [None]:
# Bar chart of passenger counts per deck.
sns.set_style("darkgrid")
sns.countplot(data=df, x="deck", palette="viridis")

In [None]:
# Passenger counts per deck, split by survival outcome.
sns.set_style("darkgrid")
sns.countplot(data=df, x="deck", hue="survived", palette="viridis")

In [None]:
# The deck plots above show too little usable information, so the column is removed.
# The frame is modified in place.
df.drop(columns=["deck"], inplace=True)

What does this now tell us?
We now know that every child has an age recorded, but age is still missing for many of the remaining passengers, so let's fill those values in next.

In [None]:
# Count the missing values in each column.
df.isna().sum()

In [None]:
# Preview the first rows of the current frame.
df.head()

In [None]:
# Baseline idea: fill missing ages with the mean age of the passenger's sex.
# This is acceptable but coarse; a finer fill by sex AND passenger class is applied
# further down. The baseline is therefore only *previewed* here. Writing it into
# `df` would leave no missing ages for the finer imputation to fill, silently
# turning that later step into a no-op.
average_age = df['age'].median()  # median, kept under its original name

sex_mean_age = df.groupby('sex')['age'].mean()
average_age_male = sex_mean_age['male']
average_age_female = sex_mean_age['female']
print(f"Median age: {average_age}, Average male age: {int(average_age_male)}, Average female age: {int(average_age_female)}")

# Preview of the sex-level fill on a separate Series; `df` itself is left untouched.
age_filled_by_sex = df['age'].fillna(df['sex'].map(sex_mean_age))
print(f"Missing ages in df: {df['age'].isna().sum()}, missing after sex-level fill (preview only): {age_filled_by_sex.isna().sum()}")


In [None]:
# Compare the age distribution across passenger classes and across sexes.
fig, (ax_by_class, ax_by_sex) = plt.subplots(1, 2, figsize=(12, 6))

sns.boxplot(data=df, x='pclass', y='age', ax=ax_by_class)
ax_by_class.set_title('Box Plot of Age by Pclass')

sns.boxplot(data=df, x='sex', y='age', ax=ax_by_sex)
ax_by_sex.set_title('Box Plot of Age by Sex')

fig.tight_layout()
plt.show()


In [None]:
# Mean age of male passengers in each passenger class (1st, 2nd, 3rd).
male_passengers = df[df['sex'] == "male"]
average_age_male_C1 = male_passengers.loc[male_passengers['pclass'] == 1, 'age'].mean()
average_age_male_C2 = male_passengers.loc[male_passengers['pclass'] == 2, 'age'].mean()
average_age_male_C3 = male_passengers.loc[male_passengers['pclass'] == 3, 'age'].mean()
for class_mean in (average_age_male_C1, average_age_male_C2, average_age_male_C3):
    print(class_mean)

In [None]:
# Mean age of female passengers in each passenger class (1st, 2nd, 3rd).
female_passengers = df[df['sex'] == "female"]
average_age_female_C1 = female_passengers.loc[female_passengers['pclass'] == 1, 'age'].mean()
average_age_female_C2 = female_passengers.loc[female_passengers['pclass'] == 2, 'age'].mean()
average_age_female_C3 = female_passengers.loc[female_passengers['pclass'] == 3, 'age'].mean()
for class_mean in (average_age_female_C1, average_age_female_C2, average_age_female_C3):
    print(class_mean)

In [None]:

# Fill each missing age with the mean age of the passenger's (sex, class) group,
# using the group means computed in the cells above.
group_mean_age = {
    ('male', 1): average_age_male_C1,
    ('male', 2): average_age_male_C2,
    ('male', 3): average_age_male_C3,
    ('female', 1): average_age_female_C1,
    ('female', 2): average_age_female_C2,
    ('female', 3): average_age_female_C3,
}

# The six groups are disjoint, so the missing-age mask can be taken once up front.
age_is_missing = df['age'].isnull()
for (sex_label, cabin_class), mean_age in group_mean_age.items():
    in_group = (df['sex'] == sex_label) & (df['pclass'] == cabin_class)
    df.loc[in_group & age_is_missing, 'age'] = mean_age



In [None]:
# Tabulate the number of missing values per column.
null_num = [df[column].isnull().sum() for column in df.columns]

pd.DataFrame(null_num, index=df.columns, columns=["Total Missing Values"])

In [None]:
# Remove, in place, every row that still has a missing value in any column.
df.dropna(axis=0, how="any", inplace=True)

In [None]:
# Re-tabulate missing values per column after the incomplete rows were dropped.
null_num = [df[column].isnull().sum() for column in df.columns]

pd.DataFrame(null_num, index=df.columns, columns=["Total Missing Values"])

In [None]:
# Class balance of the target: number of passengers per `survived` value.
sns.set_style("darkgrid")
sns.countplot(data=df, x="survived", palette="viridis")

In [None]:
# Exact counts behind the survival bar chart.
df["survived"].value_counts()

In [None]:
# Survival counts split by whether the passenger travelled alone.
sns.set_style("darkgrid")
sns.countplot(data=df, x="survived", hue="alone", palette="viridis")

In [None]:
# Survival counts split by the `adult_male` flag.
sns.set_style("darkgrid")
sns.countplot(data=df, x="survived", hue="adult_male")

In [None]:
# Survival counts split by sex.
sns.set_style("darkgrid")
sns.countplot(data=df, x="survived", hue="sex")

In [None]:
# Survival counts split by town of embarkation.
sns.set_style("darkgrid")
sns.countplot(data=df, x="survived", hue="embark_town")

In [None]:
# Number of passengers per town of embarkation.
df["embark_town"].value_counts()

In [None]:
# Remove `age`, `sibsp` and `parch` from the frame, in place.
# NOTE(review): `age` was imputed in earlier cells and is discarded here, so that
# work never reaches the model — confirm this is intended.
df.drop(columns=["age", "sibsp", "parch"], inplace=True)

In [None]:
# Preview the frame after the column drop above.
df.head()

In [None]:
# One-hot encode the categorical predictors. `drop_first=True` drops one level per
# variable so the indicator columns are not perfectly collinear.
sex = pd.get_dummies(df["sex"], drop_first=True)
who = pd.get_dummies(df["who"], drop_first=True)
embark_town = pd.get_dummies(df["embark_town"], drop_first=True)

# `adult_male` and `alone` are boolean columns. Without a prefix each would produce a
# single dummy column labelled `True`, so the concatenated frame would hold two
# identically named, non-string columns. A prefix keeps them distinct and
# string-named ("adult_male_True", "alone_True") without clashing with the source
# columns that are dropped afterwards.
adult_male = pd.get_dummies(df["adult_male"], prefix="adult_male", drop_first=True)
alone = pd.get_dummies(df["alone"], prefix="alone", drop_first=True)

In [None]:
# Append the indicator columns to the working frame, side by side.
encoded_parts = [df, sex, who, adult_male, embark_town, alone]
df = pd.concat(encoded_parts, axis="columns")

In [None]:
# Preview the frame with the indicator columns appended.
df.head()

In [None]:
# The original categorical columns are now represented by indicator columns, so
# remove them in place.
df.drop(columns=["sex", "who", "adult_male", "embark_town", "alone"], inplace=True)

In [None]:
# Preview the frame after removing the original categorical columns.
df.head()

In [None]:
# Correlation heatmap of the numeric/boolean columns.
# `df` still contains text and categorical columns, which `corr()` cannot process,
# so the frame is restricted to numeric and boolean dtypes first.
plt.figure(figsize=(10, 3))
numeric_df = df.select_dtypes(include=["number", "bool"])
sns.heatmap(numeric_df.corr(), annot=True)

**Modelling** WIP

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Build the feature matrix X and the target y.
# - `survived` is the target and `alive` is the same outcome spelled as text, so both
#   must be excluded from X (keeping `alive` would leak the answer to the model).
# - Remaining text/categorical columns (already represented by other columns) cannot
#   be converted to float by the estimator, so only numeric/boolean columns are kept.
feature_frame = df.drop(columns=["survived", "alive"])
X = feature_frame.select_dtypes(include=["number", "bool"])
X = X.rename(str, axis="columns")  # the estimator requires every column name to be a string
y = df["survived"]
X.head()

In [None]:
# Hold out 15% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=42,
)

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic-regression classifier created with no arguments, i.e. library defaults.
lm_model = LogisticRegression()

In [None]:
# Fit the classifier on the training split.
lm_model.fit(X_train, y_train)

In [None]:
# Display the fitted intercept term.
lm_model.intercept_

In [None]:
# One row per feature with its fitted coefficient.
# For a binary target `coef_` has shape (1, n_features); taking row 0 gives the
# 1-D vector that lines up with the n_features-long index of column names.
coef_param = pd.DataFrame(lm_model.coef_[0], index=X.columns, columns=["Coefficient"])
coef_param

In [None]:
# Predict class labels for the held-out test rows.
y_pred = lm_model.predict(X_test)

In [None]:
# Put the true and predicted labels side by side for inspection.
my_dict = dict(Actual=y_test, Pred=y_pred)
compare = pd.DataFrame(data=my_dict)

In [None]:
# Show 10 randomly chosen rows of actual vs. predicted labels
# (no seed is passed, so the rows differ between runs).
compare.sample(10)

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [None]:
def evaluation_metrics(actual, pred):
    """Print R², MAE, MSE and RMSE for a set of predictions.

    Parameters
    ----------
    actual : array-like
        Ground-truth target values.
    pred : array-like
        Predicted values, aligned element-wise with ``actual``.

    Returns
    -------
    None
        The metrics are written to stdout; nothing is returned.

    Notes
    -----
    These are regression metrics. When both inputs hold only 0/1 labels, every
    absolute error is 0 or 1, so MAE and MSE both equal the fraction of
    misclassified samples.
    """
    MAE = mean_absolute_error(actual, pred)
    MSE = mean_squared_error(actual, pred)
    RMSE = np.sqrt(MSE)  # reuse the MSE instead of computing it a second time
    SCORE = r2_score(actual, pred)
    print("r2_score:", SCORE, "\n","mae:", MAE, "\n","mse:",MSE, "\n","rmse:",RMSE)

In [None]:
# Print the metric summary for the test-set predictions.
evaluation_metrics(y_test, y_pred)

In [None]:
!pip install yellowbrick

In [None]:
# `lm_model` is a classifier, and the regressor visualizers reject non-regressor
# estimators, so the classification counterpart of the prediction-error plot is used.
from yellowbrick.classifier import ClassPredictionError

# Instantiate the visualizer
visualizer = ClassPredictionError(lm_model)

visualizer.fit(X_train, y_train)  # Fit the training data to the visualizer
visualizer.score(X_test, y_test)  # Evaluate the model on the test data

visualizer.show() # Finalize and render the figure

In [None]:
# A residuals plot is a regression diagnostic and its visualizer rejects classifier
# estimators. For this classifier, plot the ROC curves / AUC instead.
from yellowbrick.classifier import ROCAUC

# Instantiate the visualizer
visualizer = ROCAUC(lm_model)

visualizer.fit(X_train, y_train) # Fit the training data to the visualizer
visualizer.score(X_test, y_test) # Evaluate the model on the test data

visualizer.show() # Finalize and render the figure

In [None]:
# Second logistic-regression classifier, also created with no arguments.
# NOTE(review): configured identically to `lm_model` — confirm a separate model is needed.
lr_model = LogisticRegression()

In [None]:
# Fit the second classifier on the same training split.
lr_model.fit(X_train, y_train)

In [None]:
# Predict test labels with `lr_model`; this rebinds `y_pred`, replacing the
# predictions stored from `lm_model`.
y_pred = lr_model.predict(X_test)
y_pred

In [None]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [None]:
# Print the text classification report for the test-set predictions.
print(classification_report(y_test, y_pred))

In [None]:
# Confusion matrix of the true test labels against the predicted labels.
cnf_matrix = confusion_matrix(y_test, y_pred)

In [None]:
# Display the confusion matrix computed above.
cnf_matrix

In [None]:
# Pair the true labels with the latest predictions for a row-by-row comparison.
my_dict = dict(Actual=y_test, Pred=y_pred)
compare = pd.DataFrame(data=my_dict)

In [None]:
# Show 10 randomly chosen rows of actual vs. predicted labels
# (no seed is passed, so the rows differ between runs).
compare.sample(10)