In [233]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression

In [234]:
# Read the training and test splits (in that order) from the local data folder
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("data/train.csv", "data/test.csv")
)

In [235]:
# Per-column dtypes and non-null counts; this is where the sparsely
# populated Cabin column shows up (it is removed during pre-processing)
train.info()

# Peek at the first five rows to spot free-text columns; Name and Ticket
# are removed during pre-processing as well
train.head(n=5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [236]:
# Pre-processing the data

def pre_process(data, scaler=None, fit=True):
    """Clean a raw Titanic passenger frame and min-max scale its features.

    The steps are:

    1. drop ``Cabin`` (mostly missing) and the free-text ``Name`` / ``Ticket``;
    2. fill missing ``Age`` / ``Fare`` with the mean of this frame and missing
       ``Embarked`` with the extra category ``"Unknown"``;
    3. encode ``Sex`` and ``Embarked`` as integers;
    4. split off ``PassengerId``;
    5. scale every remaining feature column with ``scaler``.

    A ``Survived`` column, when present, is treated as the label: it is kept
    out of the scaler and returned unscaled as the first column of the result.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw frame as read from the CSV. It is not modified.
    scaler : object with ``fit`` / ``transform``, optional
        Scaler applied to the feature columns. When omitted, a fresh
        ``MinMaxScaler`` is created and fitted on ``data``.
    fit : bool, default True
        Fit ``scaler`` on this frame before transforming. Pass ``False``
        together with a scaler that was already fitted on the training frame
        so that other frames are scaled against the same ranges.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The processed frame (fresh 0..n-1 index) and a one-column frame
        holding the original ``PassengerId`` values.

    Raises
    ------
    ValueError
        If ``fit`` is False but no scaler was supplied.
    """
    if scaler is None:
        if not fit:
            raise ValueError("fit=False requires an already fitted scaler")
        scaler = MinMaxScaler()

    # drop() returns a new frame, so everything below works on a private copy
    # and never touches the caller's DataFrame.
    data = data.drop(columns=["Cabin", "Name", "Ticket"])

    # Assign the filled frame back rather than calling
    # data[col].fillna(..., inplace=True): the in-place form acts on a
    # temporary Series and does nothing under pandas Copy-on-Write.
    data = data.fillna({
        "Age": data["Age"].mean(),
        "Fare": data["Fare"].mean(),
        "Embarked": "Unknown",
    })

    # Replace the categorical strings with integer codes. Values outside the
    # mapping become NaN instead of silently staying strings.
    data["Sex"] = data["Sex"].map({"male": 0, "female": 1})
    data["Embarked"] = data["Embarked"].map({"C": 0, "Q": 1, "S": 2, "Unknown": 3})

    # Keep the ids for the submission file, but do not feed them to the model.
    passenger_ids = data[["PassengerId"]].copy()
    data = data.drop(columns=["PassengerId"])

    # The label must stay out of the scaler so that a scaler fitted on the
    # training features can also transform a frame without a label column.
    has_target = "Survived" in data.columns
    features = data.drop(columns=["Survived"]) if has_target else data

    if fit:
        scaler.fit(features)
    scaled = pd.DataFrame(scaler.transform(features), columns=features.columns)

    if has_target:
        # to_numpy() sidesteps index alignment: `scaled` has a fresh index.
        scaled.insert(0, "Survived", data["Survived"].to_numpy())

    return scaled, passenger_ids

# Fit the scaler on the training features only and reuse it for the test
# frame, so both are scaled against the same min/max ranges.
feature_scaler = MinMaxScaler()
train, train_id = pre_process(train, scaler=feature_scaler, fit=True)
test, test_id = pre_process(test, scaler=feature_scaler, fit=False)

# Sanity checks: same feature columns on both sides and no values left missing
assert list(train.drop(columns="Survived").columns) == list(test.columns)
assert not train.isna().any().any() and not test.isna().any().any()

In [237]:
# Split the training frame into the feature matrix and the label column
train_X = train.drop(columns="Survived")
train_y = train["Survived"]

# The test frame is used as the feature matrix unchanged (no label column to remove)
test_X = test

# Logistic regression model: build the classifier and train it in one step
# (fit() returns the estimator itself)
LR_model = LogisticRegression().fit(train_X, train_y)

# Predict survival for the test passengers as integer class labels
test_y = LR_model.predict(test_X).astype(int)


In [238]:
# Creating submission file
# Work on a copy of the id frame: a plain `submission_df = test_id` would only
# alias it, so adding the prediction column would silently modify test_id too.
submission_df = test_id.copy()
submission_df['Survived'] = test_y.tolist()
submission_df.to_csv('logistic_regression_submission.csv', index = False)