<a href="https://colab.research.google.com/github/moizahmed813/TechnoHacks/blob/main/Task_01/Task_01.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **Task #01:** **Performing** **Data** **Cleaning**

In [87]:
# Importing all necessary classes
import numpy as np
import pandas as pd
import seaborn as sns

%matplotlib inline
from matplotlib import pyplot as plt
from matplotlib import style

In [112]:
test_df = pd.read_csv("test.csv")
train_df = pd.read_csv("train.csv")
gender_submission_df = pd.read_csv("gender_submission.csv")

In [None]:
test_df.info()
train_df.info()
gender_submission_df.info()

In [None]:
train_df.describe()

In [None]:
test_df.describe()

In [None]:
gender_submission_df.describe()

In [None]:
train_df.head(5)
test_df.head(5)
gender_submission_df.head(5)

In [118]:
# Finding NaN (Not a Number) values in the dataframe
missing_data = train_df.isnull()
missing_count = train_df.isnull().sum()
missing_percentage = (train_df.isnull().sum() / len(train_df)) * 100
missing_info = pd.DataFrame({
    "Missing Count": missing_count,
    "Missing Percentage": missing_percentage
})
print(missing_info)

             Missing Count  Missing Percentage
PassengerId              0            0.000000
Survived                 0            0.000000
Pclass                   0            0.000000
Name                     0            0.000000
Sex                      0            0.000000
Age                    177           19.865320
SibSp                    0            0.000000
Parch                    0            0.000000
Ticket                   0            0.000000
Fare                     0            0.000000
Cabin                  687           77.104377
Embarked                 2            0.224467


In [None]:
# Generating random ages to fill the missing values
missing_age_indices = train_df[train_df['Age'].isnull()].index
for index in missing_age_indices:
    random_age = np.random.uniform(train_df['Age'].min(), train_df['Age'].max())
    train_df.loc[index, 'Age'] = random_age
train_df.head()

In [120]:
# Categorizing combined features
data = [train_df, test_df]
for dataset in data:
    dataset['relatives'] = dataset['SibSp'] + dataset['Parch']
    dataset.loc[dataset['relatives'] > 0, 'not_alone'] = 0
    dataset.loc[dataset['relatives'] == 0, 'not_alone'] = 1
    dataset['not_alone'] = dataset['not_alone'].astype(int)
train_df['not_alone'].value_counts()

1    537
0    354
Name: not_alone, dtype: int64

In [99]:
# Merging the gender_submission DataFrame with the training and testing DataFrames
train_merged = pd.merge(train_df, gender_submission_df, on='PassengerId')
test_merged = pd.merge(test_df, gender_submission_df, on='PassengerId')

train_merged.head()
test_merged.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,relatives,not_alone,Survived
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,0,1,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,1,0,1
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,0,1,0
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,0,1,0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,2,0,1


In [100]:
train_df = train_df.drop(['PassengerId'], axis=1)

In [None]:
test_df['Fare'].describe()
value = '17.152'
data = [test_df]

for dataset in data:
    dataset['Fare'] = dataset['Fare'].fillna(value)
print(test_df.head(155))

In [None]:
# Filling Cabin column

# Extracting the deck information from the Cabin column (assuming the deck is the first letter)
train_df['Deck'] = train_df['Cabin'].str.extract(r'([A-G])', expand=False)
train_df['Cabin'] = train_df['Cabin'].fillna(0)

# Converting deck letters to numeric values
deck_mapping = {'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7}
train_df['Deck'] = train_df['Deck'].map(deck_mapping)

# Filling missing deck values with zero
train_df['Deck'] = train_df['Deck'].fillna(0).astype(int)

# Displaying the updated dataframe
print(train_df.head())

In [None]:
# Checking if 'Name' column exists before dropping
if 'Name' in train_df.columns:
    train_df.drop(columns=['Name'], inplace=True)

# Displaying the updated dataframe
print(train_df.head())

In [104]:
# Performing conversion of features to numerics

# Converting "Sex" column to numerics
train_df['Sex'] = train_df['Sex'].replace({'male': 1, 'female': 0})

# Creating a mapping for "Embarked" values to numerics
embarked_mapping = {'C': 0, 'S': 1, 'Q': 2}
train_df['Embarked'] = train_df['Embarked'].replace(embarked_mapping)

# Converting "Fare" column from float to int and deal with NaN values
train_df['Fare'] = train_df['Fare'].fillna(0).astype(int)

# Converting "Age" column from float to int.
train_df['Age'] = train_df['Age'].fillna(0).astype(int)
train_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,relatives,not_alone,Deck
0,0,3,1,22,1,0,A/5 21171,7,0,1.0,1,0,0
1,1,1,0,38,1,0,PC 17599,71,C85,0.0,1,0,3
2,1,3,0,26,0,0,STON/O2. 3101282,7,0,1.0,0,1,0
3,1,1,0,35,1,0,113803,53,C123,1.0,1,0,3
4,0,3,1,35,0,0,373450,8,0,1.0,0,1,0


In [105]:
# Checking if 'Ticket' column exists before dropping
if 'Ticket' in train_df.columns:
    train_df.drop(columns=['Ticket'], inplace=True)

# Checking if 'Cabin' column exists before dropping
if 'Cabin' in train_df.columns:
    train_df.drop(columns=['Cabin'], inplace=True)
train_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,relatives,not_alone,Deck
0,0,3,1,22,1,0,7,1.0,1,0,0
1,1,1,0,38,1,0,71,0.0,1,0,3
2,1,3,0,26,0,0,7,1.0,0,1,0
3,1,1,0,35,1,0,53,1.0,1,0,3
4,0,3,1,35,0,0,8,1.0,0,1,0


In [121]:
# The above table shows that the data is processed and cleaned.