# Simple Data Transformation

This notebook demonstrates various techniques for transforming data in Python using pandas, NumPy, and other libraries. We'll cover simple transformations that are essential for data preprocessing and feature engineering.

## 1. Import Libraries and Setup

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import (
    StandardScaler, MinMaxScaler, RobustScaler,
    LabelEncoder, OneHotEncoder, OrdinalEncoder,
)

# Set visualization style.
# One seaborn call configures both the grid style and the font scale.
# sns.set() applies seaborn's own default style ("darkgrid"), so calling it
# after plt.style.use(...) silently discarded the whitegrid look that was
# requested; asking seaborn for the style directly avoids that.
sns.set_theme(style='whitegrid', font_scale=1.2)

# Show every column when a wide DataFrame is displayed
pd.set_option('display.max_columns', None)

### Create sample datasets for demonstration

In [None]:
# Create a simple dataset for demonstration purposes
np.random.seed(42)  # For reproducibility

# Create a basic dataframe with different data types:
# integers, strings, floats, booleans and datetimes
df = pd.DataFrame({
    'id': range(1, 11),
    'name': ['John Smith', 'Jane Doe', 'Bob Johnson', 'Maria Garcia', 'Wei Chen',
             'Ahmed Ali', 'Sara Patel', 'Tom Wilson', 'Anna Kim', 'Luis Rodriguez'],
    'age': np.random.randint(18, 65, 10),
    'income': np.random.randint(30000, 120000, 10),
    'education': np.random.choice(['High School', 'Bachelor', 'Master', 'PhD'], 10),
    'score': np.random.uniform(0, 100, 10).round(2),
    'registered': np.random.choice([True, False], 10),
    # Ten consecutive month-end dates. An explicit offset object is used instead
    # of the 'M' string alias, which newer pandas releases deprecate in favour
    # of 'ME'; the offset object works on old and new versions alike.
    'join_date': pd.date_range(start='2020-01-01', periods=10, freq=pd.offsets.MonthEnd())
})

# Display the first few rows of the dataframe
print("Sample dataframe:")
df.head()

In [None]:
# Second demo dataset: a small product table in which some of the
# price, stock and rating entries are missing (NaN)
messy_columns = {}
messy_columns['product'] = [
    'Laptop', 'Phone', 'Tablet', 'Monitor', 'Keyboard',
    'Mouse', 'Printer', 'Speaker', 'Headphones', 'Camera',
]
messy_columns['price'] = [1200, 800, 500, 300, np.nan, 50, 150, 120, 80, 450]
messy_columns['stock'] = [10, 25, 15, np.nan, 30, 40, 5, np.nan, 20, 10]
messy_columns['rating'] = [4.5, 4.2, 3.8, 4.0, 3.5, 4.7, np.nan, 3.9, 4.1, 4.3]
messy_columns['category'] = [
    'Electronics', 'Electronics', 'Electronics', 'Electronics', 'Accessories',
    'Accessories', 'Electronics', 'Accessories', 'Accessories', 'Electronics',
]
# One timestamp every three days, starting on 2023-01-01
messy_columns['last_updated'] = pd.date_range(start='2023-01-01', periods=10, freq='3D')

df_messy = pd.DataFrame(messy_columns)

print("Dataset with missing values:")
df_messy.head()

## 2. Basic Data Transformations

Let's start with some basic transformations like adding/removing columns, renaming columns, and changing data types.

In [None]:
# 1. Derive a categorical column by bucketing income into three labelled tiers
tier_edges = [0, 40000, 80000, 120000]
tier_labels = ['Low', 'Medium', 'High']
df['income_tier'] = pd.cut(df['income'], bins=tier_edges, labels=tier_labels)

# 2. Drop a column (returns a new frame; df itself keeps 'id')
df_reduced = df.drop(columns='id')

# 3. Rename columns (also returns a new frame)
df_renamed = df.rename(columns={'name': 'full_name', 'age': 'years'})

# 4. Cast columns to other dtypes, in place on df: age -> float, score -> int
for column, target_type in (('age', float), ('score', int)):
    df[column] = df[column].astype(target_type)

# 5. Snapshot of df (including the changes above) for the next examples
df_transformed = df.copy()

# Report each result under its heading
report = [
    ("Dataframe with new income_tier column:", df[['name', 'income', 'income_tier']].head()),
    ("\nDataframe with 'id' column removed:", df_reduced.columns.tolist()),
    ("\nDataframe with renamed columns:", df_renamed.columns.tolist()),
    ("\nDataframe with changed data types:", df[['age', 'score']].dtypes),
]
for heading, payload in report:
    print(heading)
    print(payload)

### Basic Arithmetic Operations on Columns

In [None]:
# Derive three columns in a single assign() call:
#   income_after_tax - income scaled by 0.7 (i.e. 30% tax removed)
#   score_weight     - weighted sum of score (0.2) and age (0.1)
#   bonus            - 1000 where score exceeds 70, otherwise 0
df_transformed = df_transformed.assign(
    income_after_tax=lambda frame: frame['income'] * 0.7,
    score_weight=lambda frame: frame['score'] * 0.2 + frame['age'] * 0.1,
    bonus=lambda frame: np.where(frame['score'] > 70, 1000, 0),
)

# Show the inputs next to the derived columns
print("Dataframe with arithmetic operations:")
arithmetic_columns = ['name', 'income', 'income_after_tax', 'score', 'age', 'score_weight', 'bonus']
df_transformed[arithmetic_columns].head()

## 3. Mathematical Transformations

Mathematical transformations are useful for normalizing data distributions, feature engineering, and preparing data for machine learning models.

In [None]:
# Work on a copy so the base frame is left untouched
df_math = df.copy()
income = df_math['income']
age = df_math['age']

# Log and square-root transforms, both commonly applied to right-skewed data
df_math['income_log'] = np.log(income)
df_math['income_sqrt'] = np.sqrt(income)

# Square of age
df_math['age_squared'] = age ** 2

# Exponential of score; dividing by 50 first keeps the exponent small
df_math['score_exp'] = np.exp(df_math['score'] / 50)

# Sine of age (trigonometric transforms are useful for cyclical data)
df_math['sine_age'] = np.sin(age * np.pi / 50)

# 2x2 figure: three histograms of income variants plus one scatter plot
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

histogram_panels = [
    ('income', 'Original Income Distribution'),
    ('income_log', 'Log-transformed Income'),
    ('income_sqrt', 'Square Root-transformed Income'),
]
for ax, (column, title) in zip(axes.flat, histogram_panels):
    sns.histplot(df_math[column], kde=True, ax=ax)
    ax.set_title(title)

scatter_ax = axes[1, 1]
sns.scatterplot(x='age', y='age_squared', data=df_math, ax=scatter_ax)
scatter_ax.set_title('Age vs Age Squared')

fig.tight_layout()
plt.show()

# Original columns side by side with their transformed versions
math_columns = ['income', 'income_log', 'income_sqrt', 'age', 'age_squared', 'score', 'score_exp']
df_math[math_columns].head()

### Polynomial Features Creation

In [None]:
# Polynomial terms of age (they let a linear model capture non-linear relationships)
for exponent, label in ((2, 'squared'), (3, 'cubed')):
    df_math[f'age_{label}'] = df_math['age'] ** exponent

# Interaction term: age times income, divided by 10000 to keep the numbers readable
df_math['age_income_interaction'] = df_math['age'] * df_math['income'] / 10000

# Source columns next to the engineered ones
polynomial_columns = ['age', 'income', 'age_squared', 'age_cubed', 'age_income_interaction']
df_math[polynomial_columns].head()

## 4. String Transformations

String manipulation is essential for text cleaning, feature extraction, and preparing categorical data.

In [None]:
# Copy DataFrame for string transformations
df_str = df.copy()

# Convert strings to lowercase
df_str['name_lower'] = df_str['name'].str.lower()

# Split each name into first and last name.
# n=1 splits on the first space only, so a name with more than two words still
# yields exactly two columns (everything after the first word becomes last_name)
# instead of breaking the two-column assignment.
df_str[['first_name', 'last_name']] = df_str['name'].str.split(' ', n=1, expand=True)

# Replace specific content in strings (regex=False: treat 'School' as literal text)
df_str['education_cleaned'] = df_str['education'].str.replace('School', 'Diploma', regex=False)

# Build initials from the first character of each name part
df_str['initials'] = df_str['first_name'].str[0] + df_str['last_name'].str[0]

# String concatenation.
# 'age' is cast to float earlier in the notebook, so converting it straight to
# text would read "Age: 51.0"; go through int first to get "Age: 51".
age_text = df_str['age'].round().astype(int).astype(str)
df_str['full_info'] = df_str['name'] + ' | ' + df_str['education'] + ' | Age: ' + age_text

# Length of each name, using the vectorized .str accessor rather than apply/lambda
df_str['name_length'] = df_str['name'].str.len()

# Display results
df_str[['name', 'name_lower', 'first_name', 'last_name', 'initials', 'education',
       'education_cleaned', 'full_info', 'name_length']].head()

In [None]:
# More advanced string operations

# Pattern matching using regex: does the first name start with a vowel?
df_str['has_vowel_start'] = df_str['first_name'].str.match(r'^[aeiouAEIOU]')

# Count vowels per name with the vectorized .str accessor
# (lower-case first so upper-case vowels are counted too)
df_str['vowel_count'] = df_str['name'].str.lower().str.count(r'[aeiou]')

# Left-pad the id with zeros to a width of three characters
df_str['id_padded'] = df_str['id'].astype(str).str.zfill(3)

# Extract the domain from an email address.
# The capture group takes everything after the '@'; the earlier pattern
# '@([^.]+)' stopped at the first dot and returned only 'example', not
# 'example.com'.
sample_emails = pd.Series(['john.smith@example.com', 'jane.doe@company.org', 'bob.johnson@test.net'])
domains = sample_emails.str.extract(r'@(.+)$')

print("Pattern extraction examples:")
print(domains)

# Display results
df_str[['name', 'has_vowel_start', 'vowel_count', 'id', 'id_padded']].head()

## 5. Categorical Data Transformations

Converting categorical variables into numerical representations is crucial for machine learning models.

In [None]:
# Copy DataFrame for categorical transformations
df_cat = df.copy()

# Education levels, ordered from lowest to highest
education_levels = ['High School', 'Bachelor', 'Master', 'PhD']

# One-hot encoding (good for nominal categories with no inherent order).
# Casting to a categorical with the full list of levels first guarantees one
# dummy column per level, even when a level does not occur in this small random
# sample; otherwise the column selection in the printout below could raise a
# KeyError.
education_dummies = pd.get_dummies(
    df_cat['education'].astype(pd.CategoricalDtype(categories=education_levels)),
    prefix='edu',
)
df_cat = pd.concat([df_cat, education_dummies], axis=1)

# Integer codes for education via an explicit mapping that follows the level
# order above (High School=0 ... PhD=3), so the codes preserve the ranking
education_mapping = {level: code for code, level in enumerate(education_levels)}
df_cat['education_encoded'] = df_cat['education'].map(education_mapping)

# Binary encoding (for boolean values): True -> 1, False -> 0
df_cat['registered_int'] = df_cat['registered'].astype(int)

# Ordinal encoding with custom mapping
income_mapping = {'Low': 1, 'Medium': 2, 'High': 3}
df_cat['income_tier_encoded'] = df_cat['income_tier'].map(income_mapping)

# Display the transformations
print("One-hot encoding for education:")
print(df_cat[['education', 'edu_High School', 'edu_Bachelor', 'edu_Master', 'edu_PhD']].head())
print("\nLabel encoding for education:")
print(df_cat[['education', 'education_encoded']].head())
print("\nBinary encoding for registered:")
print(df_cat[['registered', 'registered_int']].head())
print("\nOrdinal encoding for income tier:")
# Rows without an income tier are left out of this printout
print(df_cat[['income_tier', 'income_tier_encoded']].dropna(subset=['income_tier']).head())

In [None]:
# Using scikit-learn for encoding
# (OneHotEncoder and OrdinalEncoder are imported with the other libraries in
# the import cell at the top of the notebook)

# scikit-learn transformers expect 2-D input: one row per sample, one column per feature
education_array = df['education'].values.reshape(-1, 1)

# One-hot encoding using scikit-learn; sparse_output=False returns a dense array
onehot_encoder = OneHotEncoder(sparse_output=False)
education_onehot = onehot_encoder.fit_transform(education_array)
education_onehot_df = pd.DataFrame(
    education_onehot,
    # One output column per category learned by the encoder
    columns=[f'edu_sk_{cat}' for cat in onehot_encoder.categories_[0]],
    index=df.index
)

# Ordinal encoding using scikit-learn; the explicit category list fixes the
# order of the integer codes (High School first, PhD last)
ordinal_encoder = OrdinalEncoder(categories=[['High School', 'Bachelor', 'Master', 'PhD']])
education_ordinal = ordinal_encoder.fit_transform(education_array)
education_ordinal_df = pd.DataFrame(education_ordinal, columns=['education_ordinal'], index=df.index)

# Display the scikit-learn transformations
print("Scikit-learn One-hot encoding:")
print(pd.concat([df['education'], education_onehot_df], axis=1).head())

print("\nScikit-learn Ordinal encoding:")
print(pd.concat([df['education'], education_ordinal_df], axis=1).head())

## 6. Custom Transformations with apply() and map()

Custom transformations built with the pandas `.apply()` and `.map()` methods let you express complex, conditional logic that the built-in operations do not cover.

In [None]:
# Work on a copy so the base frame is left untouched
df_custom = df.copy()


def _age_label(years):
    """Return 'Young' below 30, 'Middle-aged' below 50, otherwise 'Senior'."""
    if years < 30:
        return 'Young'
    if years < 50:
        return 'Middle-aged'
    return 'Senior'


# Label every age element-wise with the helper above
df_custom['age_category'] = df_custom['age'].map(_age_label)

# Custom function with multiple conditions
def salary_bonus(row):
    """Return the bonus for one record as a share of its income.

    Args:
        row: mapping-like record providing 'education', 'age' and 'income'.

    Returns:
        income * 0.2 for a PhD younger than 40,
        income * 0.15 for a Master or Bachelor younger than 35,
        income * 0.1 in every other case.
    """
    rate = 0.1  # share used when neither special case below applies
    if row['education'] == 'PhD' and row['age'] < 40:
        rate = 0.2
    elif row['education'] in ('Master', 'Bachelor') and row['age'] < 35:
        rate = 0.15
    return row['income'] * rate

# Row-wise bonus: salary_bonus receives one row at a time
df_custom['bonus_amount'] = df_custom.apply(salary_bonus, axis='columns')

# Dictionary lookup: translate each education level into a title
title_map = dict([
    ('PhD', 'Dr.'),
    ('Master', 'MSc.'),
    ('Bachelor', 'BSc.'),
    ('High School', 'Mr./Ms.'),
])
df_custom['title'] = df_custom['education'].map(title_map)

# Combine title and name into one string per row
df_custom['formal_name'] = df_custom.apply(
    lambda record: "{} {}".format(record['title'], record['name']),
    axis='columns',
)

# Display results
print("Custom transformations results:")
custom_columns = ['name', 'age', 'age_category', 'education', 'income',
                  'bonus_amount', 'title', 'formal_name']
df_custom[custom_columns].head()

In [None]:
# More complex custom transformations

# Function that evaluates multiple columns
def calculate_score(row):
    """Return the adjusted score for one record, rounded to one decimal.

    The base score is multiplied by an age factor and an education factor,
    then a registration bonus is added.

    Args:
        row: mapping-like record providing 'score', 'age', 'education'
            and 'registered'.

    Returns:
        round(score * age_factor * education_factor + bonus, 1)

    Raises:
        KeyError: if the education value is not one of the four known levels.
    """
    base = row['score']

    # Age factor: 1.1 below 30, 1.0 below 50, otherwise 0.9
    for upper_limit, multiplier in ((30, 1.1), (50, 1.0)):
        if row['age'] < upper_limit:
            age_multiplier = multiplier
            break
    else:
        age_multiplier = 0.9

    # Education factor, looked up by level
    education_multiplier = {
        'High School': 0.9,
        'Bachelor': 1.0,
        'Master': 1.1,
        'PhD': 1.2,
    }[row['education']]

    # Flat 5-point bonus for registered records
    if row['registered']:
        bonus_points = 5
    else:
        bonus_points = 0

    return round(base * age_multiplier * education_multiplier + bonus_points, 1)

# Evaluate calculate_score once per row and store the result as a new column
adjusted_scores = df_custom.apply(calculate_score, axis='columns')
df_custom['adjusted_score'] = adjusted_scores

# Show the inputs of the calculation next to its result
print("Complex custom transformation results:")
score_columns = ['name', 'age', 'education', 'score', 'registered', 'adjusted_score']
df_custom[score_columns].head()

## 7. Date and Time Transformations

Date and time transformations are essential for time series analysis and feature engineering based on time data.

In [None]:
# Work on a copy so the base frame is left untouched
df_date = df.copy()
joined = df_date['join_date']

# Calendar components, one column per .dt attribute
for component in ('year', 'month', 'day', 'dayofweek', 'quarter'):
    df_date[f'join_{component}'] = getattr(joined.dt, component)

# Time-based features: whole days elapsed until now, and a weekend flag
# (dayofweek counts from Monday=0, so 5 and 6 are Saturday and Sunday)
df_date['days_since_join'] = (pd.Timestamp.now() - joined).dt.days
df_date['is_weekend'] = joined.dt.dayofweek >= 5

# Text renderings of the date: full formatted date and month name
df_date['join_date_formatted'] = joined.dt.strftime('%B %d, %Y')
df_date['join_month_name'] = joined.dt.month_name()

# Signed day difference to a fixed reference date (negative = before it)
reference_date = pd.Timestamp('2020-06-15')
df_date['days_from_reference'] = (joined - reference_date).dt.days

# Display results
print("Date transformations results:")
date_columns = [
    'name', 'join_date', 'join_year', 'join_month', 'join_month_name',
    'join_day', 'join_dayofweek', 'join_quarter', 'days_since_join',
    'is_weekend', 'join_date_formatted', 'days_from_reference',
]
df_date[date_columns].head()

In [None]:
# More advanced date transformations

# Create a weekly date range for demonstration
range_start = '2022-01-01'
date_range = pd.date_range(start=range_start, end='2022-12-31', freq='W')
date_df = pd.DataFrame({'date': date_range})

# Create cyclical features for time (useful for machine learning with time data).
# Mapping month / day-of-year onto sine and cosine places the end of a cycle
# next to its start (December is close to January).
month = date_df['date'].dt.month
date_df['day_of_year'] = date_df['date'].dt.dayofyear
date_df['month_sin'] = np.sin(2 * np.pi * month / 12)
date_df['month_cos'] = np.cos(2 * np.pi * month / 12)
date_df['day_sin'] = np.sin(2 * np.pi * date_df['day_of_year'] / 365)
date_df['day_cos'] = np.cos(2 * np.pi * date_df['day_of_year'] / 365)

# Fiscal quarter (assuming fiscal year starts in April):
# April-June -> 1, July-September -> 2, October-December -> 3, January-March -> 4
date_df['fiscal_quarter'] = (month - 4) % 12 // 3 + 1

# Business days elapsed since the start of the range.
# np.busday_count counts Monday-Friday days in the half-open interval
# [start, date) for the whole column at once. This replaces a per-row
# pd.date_range(..., freq=BDay) call, which passed the BDay class rather than
# an offset instance or alias, and whose "len(...) - 1" formula produced -1
# for dates with no business day since the start.
date_df['business_days_since_start'] = np.busday_count(
    np.datetime64(range_start, 'D'),
    date_df['date'].to_numpy().astype('datetime64[D]'),
)

# Age binning based on timestamp
def timestamp_to_age_bucket(timestamp, reference=None):
    """Classify how old a timestamp is relative to a reference point in time.

    Args:
        timestamp: the point in time to classify.
        reference: the point in time to measure from. Defaults to
            ``pd.Timestamp.now()``; pass a fixed value to get results that do
            not depend on when the code is run.

    Returns:
        "Recent (< 3 months)" when fewer than 90 whole days have elapsed,
        "Medium (3-6 months)" when fewer than 180 have elapsed,
        "Old (> 6 months)" otherwise.
    """
    if reference is None:
        reference = pd.Timestamp.now()
    age_days = (reference - timestamp).days
    if age_days < 90:
        return "Recent (< 3 months)"
    elif age_days < 180:
        return "Medium (3-6 months)"
    else:
        return "Old (> 6 months)"

# Classify every date element-wise into an age bucket
bucket_labels = date_df['date'].map(timestamp_to_age_bucket)
date_df['age_bucket'] = bucket_labels

# Preview the engineered date features
print("Advanced date transformations:")
date_df.head()

## 8. Scaling and Normalization

Scaling and normalization are crucial for many machine learning algorithms to ensure features are on similar scales and distributions.

In [None]:
# Copy numeric columns for scaling
numeric_df = df[['age', 'income', 'score']].copy()


def _scale_columns(scaler, frame, suffix):
    """Fit ``scaler`` on ``frame`` and return the scaled values as a DataFrame.

    The result keeps the index of ``frame``; each column is named
    ``<original name>_<suffix>``.
    """
    return pd.DataFrame(
        scaler.fit_transform(frame),
        columns=[f'{col}_{suffix}' for col in frame.columns],
        index=frame.index,
    )


def _min_max(series):
    """Rescale ``series`` linearly so its minimum maps to 0 and its maximum to 1."""
    return (series - series.min()) / (series.max() - series.min())


# Min-Max Scaling (normalization) - scales values to [0,1] range
scaler_minmax = MinMaxScaler()
numeric_df_minmax = _scale_columns(scaler_minmax, numeric_df, 'minmax')

# Standardization (Z-score normalization) - mean=0, std=1
scaler_standard = StandardScaler()
numeric_df_standard = _scale_columns(scaler_standard, numeric_df, 'standard')

# Robust Scaling - scales using median and quantiles, robust to outliers
scaler_robust = RobustScaler()
numeric_df_robust = _scale_columns(scaler_robust, numeric_df, 'robust')

# Manual scaling methods
numeric_df_manual = numeric_df.copy()
# Manual min-max scaling
numeric_df_manual['age_manual_minmax'] = _min_max(numeric_df_manual['age'])
# Manual standardization.
# ddof=0 (population standard deviation) matches what StandardScaler divides
# by; the pandas default ddof=1 would give slightly different values than the
# 'income_standard' column this one is compared with.
numeric_df_manual['income_manual_standard'] = (
    (numeric_df_manual['income'] - numeric_df_manual['income'].mean())
    / numeric_df_manual['income'].std(ddof=0)
)
# Log transformation and then min-max scaling (log1p avoids issues with zero values)
numeric_df_manual['income_log_scaled'] = _min_max(np.log1p(numeric_df_manual['income']))

# Combine all scaled dataframes
all_scaled = pd.concat([numeric_df, numeric_df_minmax, numeric_df_standard, numeric_df_robust,
                       numeric_df_manual[['age_manual_minmax', 'income_manual_standard', 'income_log_scaled']]], axis=1)

# Display the results
print("Scaling and normalization results:")
all_scaled.head()

In [None]:
# Histograms of the income feature before and after each scaling method,
# arranged on a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

histogram_panels = [
    (numeric_df['income'], 'Original Income Distribution'),
    (numeric_df_minmax['income_minmax'], 'Min-Max Scaled Income'),
    (numeric_df_standard['income_standard'], 'Standardized Income'),
    (numeric_df_robust['income_robust'], 'Robust Scaled Income'),
]
for ax, (values, title) in zip(axes.flat, histogram_panels):
    sns.histplot(values, kde=True, ax=ax)
    ax.set_title(title)

fig.tight_layout()
plt.show()

# All income variants drawn as lines on one shared axis for direct comparison
fig, ax = plt.subplots(figsize=(12, 6))
line_series = [
    ('income', 'Original'),
    ('income_minmax', 'Min-Max'),
    ('income_standard', 'Standard'),
    ('income_robust', 'Robust'),
    ('income_manual_standard', 'Manual Standard'),
    ('income_log_scaled', 'Log Scaled'),
]
for column, label in line_series:
    ax.plot(all_scaled[column], label=label)
ax.set_title('Comparison of Scaling Methods for Income')
ax.legend()
ax.grid(True)
plt.show()

## Summary

In this notebook, we've explored a variety of data transformation techniques:

1. **Basic Data Transformations**: Adding/removing columns, renaming, changing data types
2. **Mathematical Transformations**: Log, square root, exponential, trigonometric, and polynomial transformations
3. **String Transformations**: Text cleaning, pattern extraction, string manipulation
4. **Categorical Data Transformations**: One-hot encoding, label encoding, ordinal encoding
5. **Custom Transformations**: Using apply() and map() for complex transformations
6. **Date and Time Transformations**: Extracting components, formatting, calculating differences
7. **Scaling and Normalization**: Min-max scaling, standardization, robust scaling

These transformations are essential steps in the data preprocessing pipeline and help prepare data for analysis and modeling.