In [257]:
import pandas as pd
from sklearn.linear_model import LogisticRegression

### 1.a Downloading and loading in training data

In [258]:
train_df = pd.read_csv(r'C:\Users\marti\PycharmProjects\AML-assignments\hw1\titanic\train.csv')

Preliminary evaluation

In [259]:
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [260]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


### 1.b Cleaning data and selecting features
We can see that Age, Cabin, and Embarked have empty data. We will need to reconcile this.

Dropping columns not useful to our training model. These being arbitrary passenger information.

In [261]:
train_df = train_df.drop(["PassengerId","Name","Ticket"], axis=1)

We now need to set an age for the rows where there is no entry. Let's use the average age.

In [262]:
average_age = train_df["Age"].mean()
age_empty = train_df["Age"].isna()
train_df["Age"][age_empty] = average_age
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    object 
 3   Age       891 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
 7   Cabin     204 non-null    object 
 8   Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(3)
memory usage: 62.8+ KB


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df["Age"][age_empty] = average_age


There is not nearly enough cabin data for it to be a good feature. We will drop it as well.

In [263]:
train_df = train_df.drop(["Cabin"], axis=1)

Now we must deal with the 2 missing Embarked rows. Let's do one hot encoding and simply omit them.

In [264]:
train_df = pd.get_dummies(train_df)

Assuming binary gender classification, we can omit one of the sex columns as it is redundant.

In [265]:
train_df = train_df.drop('Sex_male', axis=1)
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Survived    891 non-null    int64  
 1   Pclass      891 non-null    int64  
 2   Age         891 non-null    float64
 3   SibSp       891 non-null    int64  
 4   Parch       891 non-null    int64  
 5   Fare        891 non-null    float64
 6   Sex_female  891 non-null    uint8  
 7   Embarked_C  891 non-null    uint8  
 8   Embarked_Q  891 non-null    uint8  
 9   Embarked_S  891 non-null    uint8  
dtypes: float64(2), int64(4), uint8(4)
memory usage: 45.4 KB


Let's now look at the test data. We can fill in the null values again with means.

In [266]:
test_df = pd.read_csv(r'C:\Users\marti\PycharmProjects\AML-assignments\hw1\titanic\test.csv')
PassengerID = test_df["PassengerId"]
test_df = test_df.drop(["PassengerId","Name","Ticket"], axis=1)
test_df["Age"].fillna(value=test_df["Age"].mean(), inplace=True)
test_df["Fare"].fillna(value=test_df["Fare"].mean(), inplace=True)

In [267]:
test_df = test_df.drop(["Cabin"], axis=1)
test_df = pd.get_dummies(test_df)
test_df = test_df.drop(["Sex_male"], axis=1)
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Pclass      418 non-null    int64  
 1   Age         418 non-null    float64
 2   SibSp       418 non-null    int64  
 3   Parch       418 non-null    int64  
 4   Fare        418 non-null    float64
 5   Sex_female  418 non-null    uint8  
 6   Embarked_C  418 non-null    uint8  
 7   Embarked_Q  418 non-null    uint8  
 8   Embarked_S  418 non-null    uint8  
dtypes: float64(2), int64(3), uint8(4)
memory usage: 18.1 KB


### 1.c Training the classifier

In [268]:
X_train = train_df.drop(['Survived'], axis=1)
Y_train = train_df['Survived']

In [269]:
model = LogisticRegression()
model.fit(X_train, Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [270]:
X_test = test_df
Y_predicted = model.predict(X_test)
type(PassengerID)

pandas.core.series.Series

Saving prediction data to CSV

In [271]:
predictions = pd.Series.to_frame(PassengerID, name = 'PassengerId')
predictions['Survived'] = Y_predicted
predictions.head()
predictions.to_csv('predictions.csv', index=False)