In [None]:
# Cast the publication year to integers, then inspect the resulting dtypes
year_as_int = books['year'].astype(int)
books['year'] = year_as_int
books.dtypes

# Flag the rows whose genre is one of the expected labels
expected_genres = ['Fiction', 'Non Fiction']
books['genre'].isin(expected_genres)

# Per-genre summary with explicitly named output columns:
# each entry maps output name -> (source column, aggregation)
summary_spec = {
    'mean_rating': ('rating', 'mean'),
    'std_rating': ('rating', 'std'),
    'median_year': ('year', 'median'),
}
books.groupby('genre').agg(**summary_spec)

In [None]:
# Checking for missing values: number of NaNs per column
print(salaries.isna().sum())

# Dropping missing values: for columns whose missing count is at or below
# 5% of the rows, drop the affected rows instead of imputing them
threshold = len(salaries) * 0.05
print(threshold)
cols_to_drop = salaries.columns[salaries.isna().sum() <= threshold]
print(cols_to_drop)
salaries.dropna(subset=cols_to_drop, inplace=True)

# Imputing a summary statistic: fill the remaining gaps with each column's mode.
# The last column with missing values is skipped here on purpose
# (presumably Salary_USD, which is imputed by sub-group below -- TODO confirm).
cols_with_missing_values = salaries.columns[salaries.isna().sum() > 0]
print(cols_with_missing_values)
for col in cols_with_missing_values[:-1]:
    # Index with the loop variable (not the literal 'col') and assign the
    # result back: fillna returns a new Series and does not modify in place.
    salaries[col] = salaries[col].fillna(salaries[col].mode()[0])

# Imputing by sub-group: fill missing salaries with the median salary of the
# row's experience level
salaries_dict = salaries.groupby('Experience')['Salary_USD'].median().to_dict()
print(salaries_dict)
salaries['Salary_USD'] = salaries['Salary_USD'].fillna(salaries['Experience'].map(salaries_dict))


In [2]:
# Peek at the text (object-dtype) columns
text_columns = salaries.select_dtypes('object')
print(text_columns.head())

designations = salaries['Designation']

# Extracting value from categories: flag titles mentioning "Scientist"
designations.str.contains('Scientist')

# Finding multiple phrases in strings: "|" separates the alternatives
designations.str.contains('Machine Learning|AI')
# "^" anchors the match to the start of the string (titles starting with "Data")
designations.str.contains('^Data')

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# One regex per job category; "|" separates alternative phrases
data_science = 'Data Scientist|NLP'
data_analyst = 'Analyst|Analytics'
data_engineer = 'Data Engineer|ETL|Architect|Infrastructure'
ml_engineer = 'Machine Learning|ML|Big Data|AI'
manager = 'Manager|Head|Director|Lead|Principal|Staff'
consultant = 'Consultant|Freelance'

# Category labels, in the same order as the patterns below
job_categories = [
    'Data Science',
    'Data Analytics',
    'Data Engineering',
    'Machine Learning',
    'Managerial',
    'Consultant',
]
category_patterns = [data_science, data_analyst, data_engineer, ml_engineer, manager, consultant]

# One boolean mask per category: does the designation match that pattern?
conditions = [salaries['Designation'].str.contains(pattern) for pattern in category_patterns]

# np.select assigns the label of the first matching condition;
# rows matching none of them fall back to 'Other'
salaries['Job_Category'] = np.select(conditions, job_categories, default='Other')

designation_vs_category = salaries[['Designation', 'Job_Category']]
print(designation_vs_category.head())

# Number of rows per derived job category
sns.countplot(x='Job_Category', data=salaries)
plt.show()

TypeError: type 'DataFrame' is not subscriptable

In [None]:
# converting strings to numbers
# General form (not runnable as written, so kept as a comment):
#   Series.str.replace('characters to remove', 'characters to replace them with')

# Strip the thousands separators so the values can be parsed as numbers;
# regex=False treats ',' as a literal string
salaries['Salary_In_Rupees'] = salaries['Salary_In_Rupees'].str.replace(',', '', regex=False)
print(salaries['Salary_In_Rupees'].head())

salaries['Salary_In_Rupees'] = salaries['Salary_In_Rupees'].astype(float)
# Convert rupees to US dollars with the fixed rate 0.012 and store the result
# in its own column, so Salary_In_Rupees keeps holding rupee amounts.
# NOTE(review): this (re)creates Salary_USD -- confirm no earlier cell relies
# on a differently derived Salary_USD.
salaries['Salary_USD'] = salaries['Salary_In_Rupees'] * 0.012

# adding summary statistics into a DataFrame: transform broadcasts each
# experience group's salary standard deviation back onto that group's rows
salaries['std_dev'] = salaries.groupby('Experience')['Salary_USD'].transform('std')


In [None]:
# Identifying thresholds: interquartile range of the salaries
seventy_fifth = salaries['Salary_USD'].quantile(0.75)
twenty_fifth = salaries['Salary_USD'].quantile(0.25)
salaries_iqr = seventy_fifth - twenty_fifth
print(salaries_iqr)

# identifying outliers: fences at 1.5 * IQR beyond the quartiles
upper = seventy_fifth + (1.5 * salaries_iqr)
lower = twenty_fifth - (1.5 * salaries_iqr)
print(upper, lower)

# subsetting our data: rows strictly outside the fences are the outliers
is_outlier = (salaries['Salary_USD'] < lower) | (salaries['Salary_USD'] > upper)
salaries.loc[is_outlier, ['Experience', 'Employee_Location', 'Salary_USD']]

# dropping outliers: keep everything within the fences, bounds included, so a
# salary exactly on a fence (not an outlier by the test above) is not dropped.
# Rows with a missing salary are excluded, as comparisons with NaN are False.
no_outliers = salaries[salaries['Salary_USD'].between(lower, upper)]
print(no_outliers['Salary_USD'].describe())

In [None]:
# Patterns over time: load the data, parsing marriage_date as a date on import
date_columns = ['marriage_date']
divorce = pd.read_csv('divorce.csv', parse_dates=date_columns)
divorce.dtypes

# Converting to datetime data: the same conversion applied after loading
marriage_dates = pd.to_datetime(divorce['marriage_date'])
divorce['marriage_date'] = marriage_dates
divorce.dtypes

# Extracting the month attribute through the .dt accessor
divorce['marriage_month'] = divorce['marriage_date'].dt.month
divorce.head()

In [None]:
# correlation: restrict to numeric columns first, because DataFrame.corr()
# raises on text/date columns in recent pandas versions instead of silently
# skipping them
divorce_corr = divorce.select_dtypes('number').corr()
divorce_corr

# correlation heatmap, annotated with the coefficient in each cell
sns.heatmap(divorce_corr, annot=True)
plt.show()

# pairplots: pairwise relationships between the selected variables
sns.pairplot(data=divorce, vars=['income_man', 'income_woman', 'marriage_duration'])
plt.show()

`cut` tells seaborn how far past the minimum and maximum data values the curve should extend when smoothing is applied.
With `cut=0`, the curve is limited to values between the minimum and maximum x values.

If you are interested in the cumulative distribution function, set `cumulative=True`.

In [None]:
# factor relationships and distributions: marriage duration split by the
# man's education level, one bin per unit of duration
sns.histplot(data=divorce, x='marriage_duration', hue='education_man', binwidth=1)
plt.show()

# kernel density estimate (KDE) plots: cut=0 keeps the curve within the
# observed range, cumulative=True plots the cumulative distribution
sns.kdeplot(data=divorce, x='marriage_duration', hue='education_man', cut=0, cumulative=True)
plt.show()

# relationship between marriage age and education
# The year of marriage is derived from marriage_date, since no visible cell
# creates a 'marriage_year' column. The birth-date columns are passed through
# pd.to_datetime so .dt works even if they were loaded as plain strings.
marriage_year = pd.to_datetime(divorce['marriage_date']).dt.year
divorce['man_age_marriage'] = marriage_year - pd.to_datetime(divorce['dob_man']).dt.year
divorce['woman_age_marriage'] = marriage_year - pd.to_datetime(divorce['dob_woman']).dt.year
sns.scatterplot(data=divorce, x='woman_age_marriage', y='man_age_marriage', hue='education_man')
plt.show()

In [None]:
# Considerations for categorical data

# Relative class frequency: share of rows per destination instead of raw counts
destinations = planes['Destination']
destinations.value_counts(normalize=True)

# Cross-tabulation: median price for every source/destination pair
pd.crosstab(
    planes['Source'],
    destinations,
    values=planes['Price'],
    aggfunc='median',
)

In [None]:
# Generating new features
# Correlation: restrict to numeric columns first, because DataFrame.corr()
# raises on text/date columns in recent pandas versions
sns.heatmap(planes.select_dtypes('number').corr(), annot=True)
plt.show()
print(planes.dtypes)

# Total stops
print(planes['Total_Stops'].value_counts())

# Cleaning total stops: strip the " stops"/" stop" suffix and map "non-stop"
# to "0" (an empty string cannot be converted by astype(int)).
# regex=False treats every pattern as a literal string.
planes['Total_Stops'] = (
    planes['Total_Stops']
    .str.replace(' stops', '', regex=False)
    .str.replace(' stop', '', regex=False)
    .str.replace('non-stop', '0', regex=False)
    .astype(int)
)

# Extracting month and weekday
planes['month'] = planes['Date_of_Journey'].dt.month
planes['weekday'] = planes['Date_of_Journey'].dt.weekday
print(planes[['month', 'weekday', 'Date_of_Journey']].head())

# Departure and arrival times
planes['Dep_Hour'] = planes['Dep_Time'].dt.hour
planes['Arrival_Hour'] = planes['Arrival_Time'].dt.hour

# Descriptive statistics used as the price-category boundaries
twenty_fifth = planes['Price'].quantile(0.25)
median = planes['Price'].median()
seventy_fifth = planes['Price'].quantile(0.75)
maximum = planes['Price'].max()

# Labels and bins: four labels for the four intervals between the five edges
labels = ['Economy', 'Premium Economy', 'Business Class', 'First Class']
bins = [0, twenty_fifth, median, seventy_fifth, maximum]

# pd.cut() assigns each price the label of the interval it falls into
planes["Price_Category"] = pd.cut(planes['Price'], labels=labels, bins=bins)


In [None]:
# Generating hypotheses: compare flight duration across airlines
sns.barplot(
    x='Airline',
    y='Duration',
    data=planes,
)
plt.show()