## Exercises large classes (Week_03)

## 1. Working with strings

### a) Import car dataset

In [None]:
# Import the necessary libraries
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Load the semicolon-separated file 'cars_autoscout24.csv' and convert every
# column label to lowercase in one chained expression
df = pd.read_csv('cars_autoscout24.csv', sep=';').rename(columns=str.lower)

# Preview the first 5 rows of the DataFrame
df.head(5)

### b) Create a new variable 'str_len' containing the length of each string in the variable 'description'.

In [None]:
# Create a new variable 'str_len' containing the length of each string in the
# variable 'description'. The vectorized .str.len() is used instead of
# .apply(len) because len() raises a TypeError on a missing (NaN) description,
# whereas .str.len() propagates NaN for such rows.
df['str_len'] = df['description'].str.len()

# Plot a histogram of the variable 'str_len'
# (rows without a length are dropped first, since NaN cannot be binned)
plt.figure(figsize=(7, 5))
plt.hist(df['str_len'].dropna(), bins=50, color='steelblue')
plt.title('Histogram of the variable "str_len"')
plt.xlabel('String length')
plt.ylabel('Frequency')
plt.grid()
plt.show()


### c) Create a new variable 'description_upper' from 'description' containing only uppercase letters

In [None]:
# Create a new variable 'description_upper' holding the upper-cased text of
# 'description'
df['description_upper'] = df['description'].str.upper()

# Display the first 5 rows of the original and the upper-cased column
# (.head() added so the output matches this description and the other cells)
df[['description', 'description_upper']].head()

### d) Remove all leading and trailing empty spaces in 'description_upper'

In [None]:
# Trim whitespace from both ends of every entry in 'description_upper'
upper_trimmed = df['description_upper'].str.strip()
df['description_upper'] = upper_trimmed

# Preview the first 5 entries of the cleaned column
df['description_upper'].head()

## 2.) Working with Regular expressions (regex)

### a) Extract the price as numerical value

In [None]:
# Function to extract numerical values from the price column
def extract_numerical_value(price):
    """Extract the first number from a raw price string as a float.

    Apostrophes are treated as thousands separators and any number of
    separated groups is accepted, e.g. "CHF 22'500.-" -> 22500.0 and
    "CHF 1'250'000.-" -> 1250000.0. Prices without a separator are accepted
    as well, e.g. "CHF 950.-" -> 950.0.

    Args:
        price: Raw price text. Non-string values (e.g. NaN or None) are
            treated as missing.

    Returns:
        The price as a float, or None if `price` is not a string or
        contains no digits.
    """
    # Missing cells arrive as float NaN / None; re.search would raise on them
    if not isinstance(price, str):
        return None

    # Leading digits followed by zero or more apostrophe-separated groups
    match = re.search(r"\d+(?:'\d+)*", price)
    if match is None:
        return None

    # Remove the thousands separators before converting to a number
    return float(match.group().replace("'", ''))

# Derive the numeric 'price' column from the raw price text
raw_prices = df['price_raw']
df['price'] = raw_prices.apply(extract_numerical_value)

# Show the price_raw and price columns side by side (first 5 rows)
price_columns = ['price_raw', 'price']
df[price_columns].head()

### b) Extract the original price (germ.: Neupreis) from 'description_upper'

In [None]:
# Function to extract the original price
def extract_original_price(description):
    """Extract the original price (germ.: Neupreis) from a description.

    Searches for text of the form "NEUPREIS: CHF 45'000" and returns the
    amount that follows "CHF", with apostrophes removed as thousands
    separators.

    Args:
        description: Upper-cased description text. Non-string values
            (e.g. NaN or None) are treated as missing.

    Returns:
        The original price as a float, or None if `description` is not a
        string, contains no "NEUPREIS: CHF <amount>" part, or the captured
        amount is not a valid number.
    """
    # Missing cells arrive as float NaN / None; re.search would raise on them
    if not isinstance(description, str):
        return None

    # Capture the run of digits, apostrophes and dots after "NEUPREIS: CHF"
    neupreis_pattern = r'NEUPREIS:\s*CHF\s*([\d\'.]+)'
    match = re.search(neupreis_pattern, description)
    if match is None:
        return None

    try:
        # Remove thousands separator (') before converting
        return float(match.group(1).replace("'", ""))
    except ValueError:
        # The character class also accepts captures such as "..." or
        # "1.2.3", which are not valid numbers
        return None

# Derive 'price_original' from the upper-cased description text
upper_descriptions = df['description_upper']
df['price_original'] = upper_descriptions.apply(extract_original_price)

# Show the price next to the original price
comparison_columns = ['price', 'price_original']
df[comparison_columns]

### c) Create a new binary variable 'occassion' with a value of '1' if the vehicle type (germ.: Fahrzeugart) is 'Occasion' and a value of '0' otherwise

In [None]:
# Create a binary indicator column: 1 if the upper-cased description contains
# 'OCCASION', 0 otherwise.
# - na=False maps missing descriptions to False; without it the result would
#   contain NaN, which cannot be cast to int.
# - regex=False matches 'OCCASION' as literal text.
df['occassion'] = (
    df['description_upper']
    .str.contains('OCCASION', regex=False, na=False)
    .astype(int)
)

# Number of occasion cars
print(df['occassion'].sum())

# Show the data
df.head()

## 3.) Working with pivot tables

### a) Create a subset of the data frame with all missing and duplicated values removed 

In [None]:
# Create a subset of the data without missing values and duplicates:
# dropna() removes every row with at least one missing value, and
# drop_duplicates() then removes fully duplicated rows
df_sub = df.dropna().drop_duplicates()

# Check for missing values
print('Missing values per variable:')
print(df_sub.isna().sum())

# Check for duplicated values
print('\nDuplicated values:')
print(df_sub.duplicated().sum())

# Preview the cleaned subset. This is the last expression of the cell because
# a notebook only renders the value of the final expression; placed mid-cell
# its result was silently discarded.
df_sub.head()

###  b) Create a pivot table

In [None]:
# Build a pivot table with:
# - 'occassion' as the index variable,
# - 'price' and 'price_original' as the values,
# - the mean as the aggregation function,
# and round the result to one decimal place

# Columns to aggregate and the grouping column
value_columns = ['price', 'price_original']
group_column = 'occassion'

# Restrict the input to the columns the pivot table needs
pivot_input = df_sub[value_columns + [group_column]]

# Aggregate, then round
pivot_table = pd.pivot_table(
    pivot_input,
    index=group_column,
    values=value_columns,
    aggfunc='mean',
)
pivot_table = pivot_table.round(1)

# Display the pivot table
pivot_table