<a href="https://colab.research.google.com/github/maleehasiddiqui20/dataprojects/blob/main/Logistic_Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from seaborn import load_dataset # this method will help us to #download the Titanic dataset

In [2]:
data = load_dataset("titanic")
data

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


**Data cleaning**

In [3]:
data.isnull().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [4]:
#Filling age null values with mean
mean = data['age'].mean()
data['age'].fillna(mean, inplace=True)

In [5]:
#removing deck because too many null values
data.drop("deck",inplace=True,axis=1)

In [6]:
#removing rows with null values (only 2)
data.dropna(inplace=True)

In [7]:
data.isnull().sum()

survived       0
pclass         0
sex            0
age            0
sibsp          0
parch          0
fare           0
embarked       0
class          0
who            0
adult_male     0
embark_town    0
alive          0
alone          0
dtype: int64

In [34]:
#To apply logistic regression all the columns should have categorial or numerical values.
binary_sex = pd.get_dummies (data ['sex'],drop_first = True)
binary_pclass = pd.get_dummies (data ['pclass'],drop_first = True)
binary_emb = pd.get_dummies (data ['embarked'],drop_first = True)
binary_who = pd.get_dummies (data ['who'],drop_first = True)
binary_et = pd.get_dummies (data ['embark_town'],drop_first = True)
binary_alive = pd.get_dummies (data ['alive'],drop_first = True)
binary_class = pd.get_dummies (data ['class'],drop_first = True)

modified_data_set = pd.concat ([data, binary_sex, binary_pclass,binary_emb,binary_who,binary_et,binary_alive,binary_class], axis = 1)
final_data_set = modified_data_set.drop (columns = ['sex', 'pclass','embarked','who','embark_town','alive','class'])

**Modelling the data**

In [35]:
Y= final_data_set['survived']
X= final_data_set.drop (['survived'], axis = 1 )

In [36]:
X.head(5)

Unnamed: 0,age,sibsp,parch,fare,adult_male,alone,male,2,3,Q,S,man,woman,Queenstown,Southampton,yes,Second,Third
0,22.0,1,0,7.25,True,False,1,0,1,0,1,1,0,0,1,0,0,1
1,38.0,1,0,71.2833,False,False,0,0,0,0,0,0,1,0,0,1,0,0
2,26.0,0,0,7.925,False,True,0,0,1,0,1,0,1,0,1,1,0,1
3,35.0,1,0,53.1,False,False,0,0,0,0,1,0,1,0,1,1,0,0
4,35.0,0,0,8.05,True,True,1,0,1,0,1,1,0,0,1,0,0,1


In [37]:
Y.head(5)

0    0
1    1
2    1
3    1
4    0
Name: survived, dtype: int64

In [38]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 889 entries, 0 to 890
Data columns (total 18 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   age          889 non-null    float64
 1   sibsp        889 non-null    int64  
 2   parch        889 non-null    int64  
 3   fare         889 non-null    float64
 4   adult_male   889 non-null    bool   
 5   alone        889 non-null    bool   
 6   male         889 non-null    uint8  
 7   2            889 non-null    uint8  
 8   3            889 non-null    uint8  
 9   Q            889 non-null    uint8  
 10  S            889 non-null    uint8  
 11  man          889 non-null    uint8  
 12  woman        889 non-null    uint8  
 13  Queenstown   889 non-null    uint8  
 14  Southampton  889 non-null    uint8  
 15  yes          889 non-null    uint8  
 16  Second       889 non-null    uint8  
 17  Third        889 non-null    uint8  
dtypes: bool(2), float64(2), int64(2), uint8(12)
memory

In [39]:
X.columns = X.columns.astype(str)

In [40]:
test_set_size = 0.2
seed = 1

In [41]:
X_train, X_test, Y_train, Y_test = model_selection.train_test_split (X,Y, test_size = test_set_size, random_state = seed)
model = LogisticRegression (solver = 'liblinear')
model.fit (X_train, Y_train)

In [42]:
predictions = model.predict (X_test)
report = classification_report (Y_test, predictions)
print (report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       105
           1       1.00      1.00      1.00        73

    accuracy                           1.00       178
   macro avg       1.00      1.00      1.00       178
weighted avg       1.00      1.00      1.00       178



In [43]:
print (confusion_matrix (Y_test, predictions))

[[105   0]
 [  0  73]]
