## Classification task - introduction



In [None]:
## Import Libraries
from sklearn.datasets import fetch_openml
import pandas as pd
import matplotlib as plt
from sklearn.preprocessing import LabelEncoder

# for training
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [10]:
# Load Titanic dataset
titanic_dataset = fetch_openml('titanic', version=1, as_frame=True)

# Create a dataframe called df_titanic from the dataset
df_titanic = titanic_dataset.frame

# Display the head of the DataFrame
df_titanic.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [11]:
# get info about dataframe
df_titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   pclass     1309 non-null   int64   
 1   survived   1309 non-null   category
 2   name       1309 non-null   object  
 3   sex        1309 non-null   category
 4   age        1046 non-null   float64 
 5   sibsp      1309 non-null   int64   
 6   parch      1309 non-null   int64   
 7   ticket     1309 non-null   object  
 8   fare       1308 non-null   float64 
 9   cabin      295 non-null    object  
 10  embarked   1307 non-null   category
 11  boat       486 non-null    object  
 12  body       121 non-null    float64 
 13  home.dest  745 non-null    object  
dtypes: category(3), float64(3), int64(3), object(5)
memory usage: 116.8+ KB


In [12]:
## TASK: create a dataframe called df_simplified based on the dataframe df_titanic but without the columns:
## 'name', 'ticket', 'cabin', 'boat', 'body', and 'home.dest', and remove all rows that contains missing data in at least one column

# Drop specified columns
df_simplified = df_titanic.drop(columns=['name','ticket','cabin', 'boat', 'body', 'home.dest'])

# Remove rows with missing data
df_simplified = df_simplified.dropna()

# Convert categorical variables to numerical values (added to enable training)
label_encoder = LabelEncoder()
df_simplified['sex'] = label_encoder.fit_transform(df_simplified['sex'])
df_simplified['embarked'] = label_encoder.fit_transform(df_simplified['embarked'])

# Display the head of the new DataFrame
df_simplified.head()

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare,embarked
0,1,1,0,29.0,0,0,211.3375,2
1,1,1,1,0.9167,1,2,151.55,2
2,1,0,0,2.0,1,2,151.55,2
3,1,0,1,30.0,1,2,151.55,2
4,1,0,0,25.0,1,2,151.55,2


In [13]:
df_simplified.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1043 entries, 0 to 1308
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   pclass    1043 non-null   int64   
 1   survived  1043 non-null   category
 2   sex       1043 non-null   int64   
 3   age       1043 non-null   float64 
 4   sibsp     1043 non-null   int64   
 5   parch     1043 non-null   int64   
 6   fare      1043 non-null   float64 
 7   embarked  1043 non-null   int64   
dtypes: category(1), float64(2), int64(5)
memory usage: 66.3 KB


In [14]:
# Count how many passengers survived in percentage
survival_counts = df_simplified['survived'].value_counts(normalize=True) * 100
print(survival_counts)

survived
0    59.252157
1    40.747843
Name: proportion, dtype: float64


<span style="font-size:28px">Classification</span>

In [15]:
# Define the feature matrix X and the target vector y for survived
X = df_simplified.drop(columns=['survived'])
y = df_simplified['survived']

# Display the first few rows of X and y
print(X.head())
print(y.head())

   pclass  sex      age  sibsp  parch      fare  embarked
0       1    0  29.0000      0      0  211.3375         2
1       1    1   0.9167      1      2  151.5500         2
2       1    0   2.0000      1      2  151.5500         2
3       1    1  30.0000      1      2  151.5500         2
4       1    0  25.0000      1      2  151.5500         2
0    1
1    1
2    0
3    0
4    0
Name: survived, dtype: category
Categories (2, object): ['0', '1']


In [16]:
# Split data for train and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=18,stratify=y)
# stratify=y: makes the proportions between parameters the same within the whole datasets

In [None]:
log_reg = LogisticRegression()
log_reg.fit(X_train,y_train)
log_reg_y_hat_test = log_reg.predict(X_test)
log_reg_y_hat_train = log_reg.predict(X_train)

RF = RandomForestClassifier()
RF.fit(X_train,y_train)
RF_y_hat_test = RF.predict(X_test)
RF_y_hat_train = RF.predict(X_train)

# Calculate accuracy
log_reg_train_accuracy = accuracy_score(y_train, log_reg_y_hat_train)
log_reg_test_accuracy = accuracy_score(y_test, log_reg_y_hat_test)
RF_train_accuracy = accuracy_score(y_train, RF_y_hat_train)
RF_test_accuracy = accuracy_score(y_test, RF_y_hat_test)

print(f"LogReg: Training Accuracy: {log_reg_train_accuracy}")
print(f"LogReg: Testing Accuracy: {log_reg_test_accuracy}")
print(f"RandomForest: Training Accuracy: {RF_train_accuracy}")
print(f"RandomForest: Testing Accuracy: {RF_test_accuracy}")

LogReg: Training Accuracy: 0.7913669064748201
LogReg: Testing Accuracy: 0.784688995215311
RandomForest: Training Accuracy: 0.9844124700239808
RandomForest: Testing Accuracy: 0.784688995215311


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
