In [1]:
import pandas as pd
import csv

In [2]:
path = "Titanic-Dataset.csv"
df = pd.read_csv(path)

In [3]:
print(df.head(891)) #dataset has 891 rows and 12 cols
print(df.info()) #cols 5, 10 and 11 have null values
print(df.describe()) #7 cols are numerical, age having null values to be replaced by mean/median 
print(df.isnull().sum()) #Age, Cabin and Embarked have null values

     PassengerId  Survived  Pclass  \
0              1         0       3   
1              2         1       1   
2              3         1       3   
3              4         1       1   
4              5         0       3   
..           ...       ...     ...   
886          887         0       2   
887          888         1       1   
888          889         0       3   
889          890         1       1   
890          891         0       3   

                                                  Name     Sex   Age  SibSp  \
0                              Braund, Mr. Owen Harris    male  22.0      1   
1    Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                               Heikkinen, Miss. Laina  female  26.0      0   
3         Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                             Allen, Mr. William Henry    male  35.0      0   
..                                                 ...     ...   ... 

In [4]:
#find duplicated duplicates
duplicatedRows = df[df.duplicated()]
print(duplicatedRows) #no duplicates found

Empty DataFrame
Columns: [PassengerId, Survived, Pclass, Name, Sex, Age, SibSp, Parch, Ticket, Fare, Cabin, Embarked]
Index: []


In [5]:
#Handle outliers 
Q1 = df['Age'].quantile(0.25)
Q3 = df['Age'].quantile(0.75)
IQR = Q3 - Q1
threshhold = 1.5
lower_bound = Q1 - threshhold * IQR
upper_bound = Q3 + threshhold * IQR
print("Lower Bound:", lower_bound, "Upper Bound:", upper_bound)
outliers = df[(df['Age'] < lower_bound) | (df['Age'] > upper_bound)]
print('Outliers Printing')
print(outliers) #data has outliers

Lower Bound: -6.6875 Upper Bound: 64.8125
Outliers Printing
     PassengerId  Survived  Pclass                                  Name  \
33            34         0       2                 Wheadon, Mr. Edward H   
54            55         0       1        Ostby, Mr. Engelhart Cornelius   
96            97         0       1             Goldschmidt, Mr. George B   
116          117         0       3                  Connors, Mr. Patrick   
280          281         0       3                      Duane, Mr. Frank   
456          457         0       1             Millet, Mr. Francis Davis   
493          494         0       1               Artagaveytia, Mr. Ramon   
630          631         1       1  Barkworth, Mr. Algernon Henry Wilson   
672          673         0       2           Mitchell, Mr. Henry Michael   
745          746         0       1          Crosby, Capt. Edward Gifford   
851          852         0       3                   Svensson, Mr. Johan   

      Sex   Age  SibSp  Par

In [6]:
#calculate median and mean
median_age = df['Age'].median()
mean_age = df['Age'].mean()
print(median_age)
print(mean_age)

28.0
29.69911764705882


In [7]:
#replace outliers using median
df.loc[(df['Age'] < lower_bound) | (df['Age'] > upper_bound), 'Age'] = median_age
#fill nan for age using median
df['Age'] = df['Age'].fillna(df['Age'].median())

In [8]:
#conver floats to int
df['Age'] = pd.to_numeric(df['Age']).astype(int)

In [9]:
#fill nan for Cabin using mode
df['Cabin'] = df['Cabin'].fillna(df['Cabin'].mode()[0])

In [10]:
#fill nan for Embarked using mode
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

In [11]:
#data after cleaning
print(df.info())
print(df.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          891 non-null    int32  
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        891 non-null    object 
 11  Embarked     891 non-null    object 
dtypes: float64(1), int32(1), int64(5), object(5)
memory usage: 80.2+ KB
None
PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64


In [13]:
#export clean data to new file
clean_file_path = "Clean-Titanic-Dataset.csv"
with open(clean_file_path,mode='w',newline='') as file:
    writer = csv.writer(file)
    writer.writerow(df)
    writer.writerows(df.to_numpy())

In [14]:
print("Submitted by: Nouman Karim")

Submitted by: Nouman Karim
