In [119]:
import pandas as pd
import seaborn as sns

In [120]:
df=sns.load_dataset("titanic")

In [121]:
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [122]:
# rename the column

In [123]:
df.columns

Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town',
       'alive', 'alone'],
      dtype='object')

In [124]:
df = df.rename(columns={'sex':'gender'})
df.columns

Index(['survived', 'pclass', 'gender', 'age', 'sibsp', 'parch', 'fare',
       'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town',
       'alive', 'alone'],
      dtype='object')

In [125]:
df = df.rename(columns={'sex':'gender','class':'class_name'})
df.columns

Index(['survived', 'pclass', 'gender', 'age', 'sibsp', 'parch', 'fare',
       'embarked', 'class_name', 'who', 'adult_male', 'deck', 'embark_town',
       'alive', 'alone'],
      dtype='object')

In [126]:
# identifiers outliers using IQR
import numpy as np

In [127]:
df['fare'].isna().sum()

0

In [128]:
fare = df['fare'].dropna()


In [129]:
q1 = fare.quantile(0.25)

q3 = fare.quantile(0.75)

iqr = q3-q1
lower_bound = q1 - 1.5*iqr
upper_bound = q3 + 1.5*iqr
outliers = df[(df['fare']< lower_bound)|(df['fare']> upper_bound)]
print("Outliers",outliers[['fare']])

Outliers          fare
1     71.2833
27   263.0000
31   146.5208
34    82.1708
52    76.7292
..        ...
846   69.5500
849   89.1042
856  164.8667
863   69.5500
879   83.1583

[116 rows x 1 columns]


In [130]:
# CREATING NEW COLUMNS IN EXISTING DATASET

In [131]:
df['age_category'] = df["age"].apply(lambda x:"senior" if x >= 60 else "adult" if x>=18 else "child")
print(df[['age','age_category']].head(10))

    age age_category
0  22.0        adult
1  38.0        adult
2  26.0        adult
3  35.0        adult
4  35.0        adult
5   NaN        child
6  54.0        adult
7   2.0        child
8  27.0        adult
9  14.0        child


In [132]:
#groupby and agrregation: simply data summarization by using aggregation function

In [133]:
df_groupby = df.groupby('gender').agg({'fare':['mean','min','max']})
df_groupby

Unnamed: 0_level_0,fare,fare,fare
Unnamed: 0_level_1,mean,min,max
gender,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
female,44.479818,6.75,512.3292
male,25.523893,0.0,512.3292


In [134]:

#pivoting data : summarizing in the form of tables

In [146]:
df_pivot = df.pivot_table(index='gender',columns = 'class_name',values = 'fare',aggfunc = 'mean')
df_pivot

  df_pivot = df.pivot_table(index='gender',columns = 'class_name',values = 'fare',aggfunc = 'mean')


class_name,First,Second,Third
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,106.125798,21.970121,16.11881
male,67.226127,19.741782,12.661633


In [136]:
# merging two dataframes:

In [137]:
df1 = sns.load_dataset('titanic')
df2 = sns.load_dataset('tips')
merged_df=pd.merge(df1,df2,on ='sex')
print(merged_df.head())

Empty DataFrame
Columns: [survived, pclass, sex, age, sibsp, parch, fare, embarked, class, who, adult_male, deck, embark_town, alive, alone, total_bill, tip, smoker, day, time, size]
Index: []

[0 rows x 21 columns]


In [138]:
# filling the null values by using mean or median

In [139]:
df1 = sns.load_dataset('titanic')


In [140]:
df.head(5)

Unnamed: 0,survived,pclass,gender,age,sibsp,parch,fare,embarked,class_name,who,adult_male,deck,embark_town,alive,alone,age_category
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False,adult
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False,adult
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True,adult
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False,adult
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True,adult


In [141]:
df1.isna().sum()

Unnamed: 0,0
survived,0
pclass,0
sex,0
age,177
sibsp,0
parch,0
fare,0
embarked,2
class,0
who,0


In [147]:
df1['age'].fillna(df1['age'].mean(),inplace = True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df1['age'].fillna(df1['age'].mean(),inplace = True)


In [148]:
df1.isna().sum()

Unnamed: 0,0
survived,0
pclass,0
sex,0
age,0
sibsp,0
parch,0
fare,0
embarked,2
class,0
who,0


In [144]:
# where conditional cleaning(where function we use for cleaning th data)

In [145]:
df['fare_category']=np.where(df['fare']>50,'High','Low')
df.head()

Unnamed: 0,survived,pclass,gender,age,sibsp,parch,fare,embarked,class_name,who,adult_male,deck,embark_town,alive,alone,age_category,fare_category
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False,adult,Low
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False,adult,High
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True,adult,Low
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False,adult,High
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True,adult,Low
