In [None]:
import pandas as pd

# Load the Titanic dataset into a DataFrame
df = pd.read_csv('Titanic-Dataset.csv')

# --- Data Exploration ---
# Display total number of rows and columns
print('Total rows and columns:', df.shape)

# Summarize the dataset (data types, non-null counts, memory usage).
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()

# Generate descriptive statistics for numerical columns (mean, std, min, max)
print(df.describe())

# --- Data Selection & Slicing ---
# Extract and display the list of column names
columns_list = df.columns.tolist()
print("Columns:", columns_list)

# Create a subset DataFrame with specific features
new_df = df[['Name', 'Sex', 'Age']]
print(new_df.head())

# Concatenate the first 10 rows and the last 5 rows for a quick overview
result = pd.concat([df.iloc[:10], df.iloc[-5:]])
print(result)

# --- Categorical Analysis ---
# Count occurrences of unique values in 'Survived' and 'Sex' columns
print("Survival Count:\n", df['Survived'].value_counts())
print("Gender Distribution:\n", df['Sex'].value_counts())

# --- Data Filtering ---
# Filter data: Female passengers older than 30
# (rows with a missing 'Age' fail the `> 30` comparison and are excluded)
female_over_30 = df[(df['Sex'] == 'female') & (df['Age'] > 30)]
print("Females over 30:\n", female_over_30)

# Filter data: Passengers in 1st Class who did not survive
first_class_non_survivors = df[(df['Pclass'] == 1) & (df['Survived'] == 0)]
print("1st Class Non-Survivors:\n", first_class_non_survivors)

In [None]:
import pandas as pd

# Load the raw Titanic dataset
df = pd.read_csv('Titanic-Dataset.csv')

# --- Data Cleaning ---
# Check for missing values in each column
print(df.isnull().sum())

# Handle missing values in 'Age' by filling them with the median age
median_age = df['Age'].median()
df['Age'] = df['Age'].fillna(median_age)

# Drop the 'Cabin' column as it has too many missing values to be useful
df = df.drop(columns=['Cabin'])

# Fill missing values in 'Embarked' with 'S' (Southampton), hard-coded here as
# the most common port.
# NOTE(review): confirm against df['Embarked'].mode() if the input file changes.
df['Embarked'] = df['Embarked'].fillna('S')

# Verify the remaining missing-value counts per column
print(df.isnull().sum())

# Save the cleaned dataset. Later cells read 'Titanic-Dataset-Cleaned.csv', so
# this file has to exist for the notebook to run from a fresh checkout.
# It is written before feature engineering, so the engineered columns below
# are not part of the saved file.
df.to_csv('Titanic-Dataset-Cleaned.csv', index=False)

# --- Data Analysis & Aggregation ---
# 'df_clean' is another name for the same DataFrame object as 'df' (no copy is
# made), so the columns added under "Feature Engineering" also appear on 'df'.
df_clean = df

# Calculate the average age grouped by gender
average_age = df_clean.groupby('Sex')['Age'].mean()
print("Average Age by Sex:\n", average_age)

# Calculate survival rate grouped by Passenger Class
average_pclass = df_clean.groupby('Pclass')['Survived'].mean()
print("Survival Rate by Class:\n", average_pclass)

# Create a pivot table to see Survival rates across both Sex and Pclass
# (pivot_table aggregates with the mean by default)
pivot = df_clean.pivot_table(values='Survived', index='Sex', columns='Pclass')
print("Survival Pivot Table:\n", pivot)

# --- Feature Engineering ---
# Extract 'Title' (Mr, Mrs, Miss, etc.) from the 'Name' column using Regex:
# the first run of letters that is immediately followed by a period.
# A raw string keeps '\.' as a regex escape instead of an invalid Python
# string escape (which raises a warning on current Python versions).
df_clean['Title'] = df_clean['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)

# Mapping Categorical 'Sex' to Numerical values (Encoding)
# Values other than 'male'/'female' would map to NaN.
df_clean['Sex_Encoded'] = df_clean['Sex'].map({'male': 1, 'female': 0})

# Create a new feature 'FamilySize' by combining Siblings/Spouses and Parents/Children
# Adding 1 to include the passenger themselves
df_clean['FamilySize'] = df_clean['SibSp'] + df_clean['Parch'] + 1

# Display the relationship between family components and the new FamilySize feature
print(df_clean[['SibSp', 'Parch', 'FamilySize']].head())

In [None]:
import pandas as pd

# Read the cleaned Titanic data from disk
df = pd.read_csv('Titanic-Dataset-Cleaned.csv')

# --- Outlier Detection using Interquartile Range (IQR) ---

# Work on the 'Fare' column; fetch its 25th and 75th percentiles in one call
fare = df['Fare']
q1, q3 = fare.quantile([0.25, 0.75])

# The IQR is the spread of the middle 50% of fares
iqr = q3 - q1

# Tukey-style fences: anything further than 1.5 * IQR outside the quartiles
# is treated as an outlier
fence = 1.5 * iqr
lower_bound = q1 - fence
upper_bound = q3 + fence

# Report the computed statistics
print(f"IQR for Fare: {iqr:.2f}")
print(f"Lower Boundary: {lower_bound:.2f}")
print(f"Upper Boundary: {upper_bound:.2f}")

# --- Identifying Outliers ---

# Keep only rows whose fare falls below the lower fence or above the upper one
# (a missing fare satisfies neither comparison and is therefore not selected)
is_outlier = (fare < lower_bound) | (fare > upper_bound)
outliers = df[is_outlier]

# Show how many outliers were found, plus the first few of them
print(f"Total Outliers Detected: {len(outliers)}")
print("Preview of Outlier Records (Name and Fare):")
print(outliers[['Name', 'Fare']].head())

In [None]:
import pandas as pd

# Load the cleaned Titanic dataset up front so this cell does not depend on a
# `df` left in the kernel by a previously executed cell. Nothing below modifies
# `df`, so a single load serves every section of this cell.
df = pd.read_csv('Titanic-Dataset-Cleaned.csv')

# --- Categorical Encoding ---
# Perform One-Hot Encoding on 'Sex' and 'Embarked' columns
# This converts categorical text into binary (0 or 1) columns for Machine Learning models
# get_dummies returns a new DataFrame; `df` itself is left unchanged.
df_encoded = pd.get_dummies(df, columns=['Sex', 'Embarked'], dtype=int)
print("Encoded DataFrame Preview:\n", df_encoded.head())

# --- Advanced Filtering ---
# Identify "Solo Travelers" between the ages of 20 and 40
# Criteria: Age [20-40], no siblings/spouses (SibSp=0), and no parents/children (Parch=0)
total_solo_adults = df[(df['Age'] >= 20) & (df['Age'] <= 40) & (df['SibSp'] == 0) & (df['Parch'] == 0)]
print("Solo Travelers (Age 20-40):\n", total_solo_adults)

# --- Data Sorting & Grouping ---
# Find the top 3 most expensive tickets (Fares) for each Passenger Class (Pclass)
# We sort by Fare descending, then group by Pclass and take the top 3 results
# (groupby(...).head(3) keeps the first three rows of each group in sorted order)
top_3_fares = df.sort_values('Fare', ascending=False).groupby('Pclass').head(3)
print("Top 3 Highest Fares per Class:\n", top_3_fares[['Pclass', 'PassengerId', 'Name', 'Fare']])

# --- Merging DataFrames ---
# Creating two separate subsets to demonstrate a "Join" or "Merge" operation
df1 = df[['PassengerId', 'Name']]
df2 = df[['PassengerId', 'Ticket']]

# Merge (Join) the two DataFrames on the common key 'PassengerId'
# This is similar to a SQL INNER JOIN; `how='inner'` is pandas' default and is
# spelled out here so the join type is visible in the code.
merged_df = pd.merge(df1, df2, on='PassengerId', how='inner')
print("Successfully Merged DataFrame:\n", merged_df.head())