In [2]:
# Steps in Data Preprocessing

# 1. Data Collection: Gathering raw data from various sources.
# Task 1: Collect data from two different sources and merge them.
# Task 2: Validate the integrity of the collected datasets.
# Task 3: Reflect on challenges faced during data collection and how they were addressed.

# 2. Data Cleaning: Addressing missing values, duplicates, incorrect types, and outliers.
# Task 1: Clean a given dataset and document the changes made.
# Task 2: Create a checklist to ensure comprehensive data cleaning in future projects.
# Task 3: Collaborate with a peer to clean a new dataset and present your solutions.

# 3. Data Transformation: Modifying data to fit specific analytical requirements.
# Task 1: Transform a date column into separate 'day', 'month', and 'year' columns.
# Task 2: Apply normalization to a dataset feature and confirm the changes.
# Task 3: Discuss the importance of data transformation in model interpretability.


# 4. Feature Scaling: Adjusting data features to a common scale.
# Task 1: Apply Min-Max scaling to a dataset.
# Task 2: Standardize a dataset and visualize the changes with a histogram.
# Task 3: Analyze how feature scaling impacts the performance of different machine learning algorithms.


# 5. Feature Engineering: Creating new features from existing ones to improve model accuracy.
# Task 1: Create a new synthetic feature from existing dataset features.
# Task 2: Evaluate the impact of new features on model accuracy.
# Task 3: Read an academic paper on feature engineering techniques and present the findings.
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Step 1: Data Collection

# Task 1: Collect data from two different sources and merge them
url1 = 'https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv'
url2 = 'https://raw.githubusercontent.com/datasciencedojo/datasets/master/iris.csv'

# Load the datasets
df_titanic = pd.read_csv(url1)
df_iris = pd.read_csv(url2)

# Merge datasets on a common column (though they do not have a common column, we'll merge for the sake of the task)
df_merged = pd.concat([df_titanic[['Pclass', 'Age', 'Fare']], df_iris[['SepalLengthCm', 'SepalWidthCm']]], axis=1)

print("Merged DataFrame:")
print(df_merged.head())

# Task 2: Validate the integrity of the collected datasets
# Check for missing values and data types
print("\nMissing values in Titanic data:")
print(df_titanic.isnull().sum())

print("\nMissing values in Iris data:")
print(df_iris.isnull().sum())

# Check data types
print("\nData types in Titanic data:")
print(df_titanic.dtypes)

print("\nData types in Iris data:")
print(df_iris.dtypes)

# Task 3: Reflect on challenges faced during data collection and how they were addressed
# Reflection: 
# - One challenge in data collection can be handling missing or incomplete data.
# - Solution: Missing data was handled using imputation methods or by removing rows/columns with too many missing values.
# - Another challenge can be merging datasets with different structures. In this case, we used concatenation, though a more appropriate merge could be used with a common key.

# Step 2: Data Cleaning

# Task 1: Clean a given dataset and document the changes made
# Clean the Titanic dataset: 
# - Replace missing 'Age' with the mean
df_titanic['Age'].fillna(df_titanic['Age'].mean(), inplace=True)

# - Remove duplicates
df_titanic.drop_duplicates(inplace=True)

# - Fix incorrect data types (if any)
df_titanic['Pclass'] = df_titanic['Pclass'].astype(int)

print("\nCleaned Titanic dataset:")
print(df_titanic.head())

# Task 2: Create a checklist to ensure data is cleaned properly
# - Ensure missing values are handled (either removed or imputed).
# - Remove duplicate rows.
# - Correct data types (e.g., categorical features as 'category', numerical features as 'int' or 'float').
# - Handle outliers using appropriate methods (e.g., IQR or z-score).
# - Check for any inconsistencies in data (e.g., unexpected characters or values).

# Step 3: Data Standardization and Visualization

# Task 1: Standardize a dataset and visualize the changes with a histogram
# Standardize the 'Age' and 'Fare' columns of the Titanic dataset
scaler = StandardScaler()
df_titanic[['Age', 'Fare']] = scaler.fit_transform(df_titanic[['Age', 'Fare']])

# Visualize the change before and after standardization
plt.figure(figsize=(10,6))
sns.histplot(df_titanic['Age'], kde=True, color='blue', label='Standardized Age')
sns.histplot(df_titanic['Fare'], kde=True, color='green', label='Standardized Fare')
plt.legend()
plt.title('Distribution of Standardized Age and Fare')
plt.show()

# Task 2: Analyze how feature scaling impacts the performance of different machine learning algorithms
# Split data into features and target
X = df_titanic[['Pclass', 'Age', 'Fare']]
y = df_titanic['Survived']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model before scaling (using Linear Regression)
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
mse_before_scaling = mean_squared_error(y_test, y_pred)

# Now scale the features
X_train_scaled = scaler.fit_transform(X_train[['Age', 'Fare']])
X_test_scaled = scaler.transform(X_test[['Age', 'Fare']])

# Train model with scaled features
model.fit(X_train_scaled, y_train)
y_pred_scaled = model.predict(X_test_scaled)
mse_after_scaling = mean_squared_error(y_test, y_pred_scaled)

# Compare model performance
print("\nMSE before scaling:", mse_before_scaling)
print("MSE after scaling:", mse_after_scaling)

# Step 4: Feature Engineering

# Task 1: Create a new synthetic feature from existing dataset features
# For example, create a new feature 'FamilySize' by combining 'SibSp' and 'Parch'
df_titanic['FamilySize'] = df_titanic['SibSp'] + df_titanic['Parch']
print("\nDataFrame with new feature 'FamilySize':")
print(df_titanic[['SibSp', 'Parch', 'FamilySize']].head())

# Task 2: Evaluate the impact of new features on model accuracy
# Split data again with new feature
X = df_titanic[['Pclass', 'Age', 'Fare', 'FamilySize']]
y = df_titanic['Survived']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
mse_with_new_feature = mean_squared_error(y_test, y_pred)

print("\nMSE with new feature 'FamilySize':", mse_with_new_feature)





HTTPError: HTTP Error 404: Not Found