# Exploratory Data Analysis

This notebook focuses on the exploratory analysis of the [Vehicles.csv](https://www.kaggle.com/datasets/austinreese/craigslist-carstrucks-data) dataset. It aims to uncover initial insights and patterns through visualizations and statistical summaries by understanding feature distributions and examining relationships between features.

By the end of this notebook, we will have a thorough understanding of the dataset’s characteristics and the interactions between features, setting the stage for effective model development.

In [None]:
import dask.dataframe as dd
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Function to optimize memory usage
def optimize_memory(df):
    """Shrink a DataFrame's memory footprint by narrowing column dtypes.

    Float columns are downcast with ``pd.to_numeric(..., downcast='float')``,
    integer columns with ``downcast='integer'``, and object columns whose
    ratio of distinct values to rows is below 0.5 are converted to
    ``category``.

    Args:
        df: pandas DataFrame to optimize. Its columns are reassigned in
            place, so the caller's object is modified.

    Returns:
        The same DataFrame object that was passed in.
    """
    for column in df.select_dtypes(include=['float']):
        df[column] = pd.to_numeric(df[column], downcast='float')
    for column in df.select_dtypes(include=['int']):
        df[column] = pd.to_numeric(df[column], downcast='integer')

    # Every column has the same length, so compute it once. With zero rows
    # the distinct-value ratio is undefined (0 / 0), so skip the category
    # conversion instead of raising ZeroDivisionError.
    num_total_values = len(df)
    if num_total_values == 0:
        return df

    for column in df.select_dtypes(include=['object']):
        # dropna=False counts NaN as a distinct value, like len(unique()).
        num_unique_values = df[column].nunique(dropna=False)
        if num_unique_values / num_total_values < 0.5:
            df[column] = df[column].astype('category')
    return df

# Read these columns as plain objects so that Dask does not have to infer
# their dtypes.
dtype_spec = {
    column_name: 'object'
    for column_name in (
        'VIN',
        'condition',
        'cylinders',
        'description',
        'drive',
        'fuel',
        'image_url',
        'manufacturer',
        'model',
        'paint_color',
        'posting_date',
        'size',
        'title_status',
        'transmission',
        'type',
    )
}

# Load the CSV through Dask, materialize it as a pandas DataFrame for
# preprocessing, then narrow its dtypes to save memory.
df = optimize_memory(
    dd.read_csv('../data/vehicles.csv', dtype=dtype_spec).compute()
)

# Categorical features that get one-hot encoded.
categorical_cols = [
    'manufacturer',
    'model',
    'condition',
    'cylinders',
    'fuel',
    'title_status',
    'transmission',
    'drive',
    'size',
    'type',
    'paint_color',
]

# Replace each categorical column with indicator columns; the first level of
# each is dropped.
# NOTE(review): 'VIN', 'description', 'image_url' and 'posting_date' are read
# as objects above but are neither encoded nor dropped here, so they stay in
# the frame as non-numeric columns -- confirm that is intended.
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Numerical features that get standardized.
numerical_cols = ['year', 'odometer', 'lat', 'long']

# Fit the scaler on the numerical columns and write the scaled values back.
scaler = StandardScaler().fit(df[numerical_cols])
df[numerical_cols] = scaler.transform(df[numerical_cols])

# Function to impute missing values using a RandomForestRegressor
def impute_missing_values(df, target_column):
    """Fill the nulls of one column with RandomForestRegressor predictions.

    A regressor is trained on the rows where ``target_column`` is present,
    using every other column of ``df`` as a predictor, and its predictions
    are written into the rows where ``target_column`` is null.

    Args:
        df: pandas DataFrame holding the target and predictor columns. The
            target column is filled in place.
        target_column: Name of the column whose null values are imputed.

    Returns:
        The same DataFrame object. It is returned untouched when
        ``target_column`` has no null values.

    Raises:
        ValueError: If every value of ``target_column`` is null, so there
            are no rows to train on.
    """
    missing_mask = df[target_column].isnull()

    # Nothing to impute: skip training entirely rather than asking the model
    # to predict on a zero-row frame.
    if not missing_mask.any():
        return df

    # No observed values: fail early with a clear message instead of an
    # opaque error from inside the estimator's fit().
    if missing_mask.all():
        raise ValueError(
            f"Cannot impute column {target_column!r}: all values are missing"
        )

    # NOTE(review): every remaining column is used as a predictor; this
    # assumes they are all numeric -- confirm against the caller.
    predictors = df.drop(columns=[target_column])
    observed_mask = ~missing_mask

    model = RandomForestRegressor()
    model.fit(predictors[observed_mask], df.loc[observed_mask, target_column])

    df.loc[missing_mask, target_column] = model.predict(predictors[missing_mask])

    return df

# Columns that contain at least one null value
columns_with_missing_values = df.columns[df.isnull().any()]

# Fill each of those columns in turn with model-based predictions.
# NOTE(review): impute_missing_values fits a regressor on all other columns;
# any non-numeric column still in df (as predictor or as target) would
# presumably be rejected by the estimator -- confirm before running.
for column in columns_with_missing_values:
    df = impute_missing_values(df, column)

# Separate the target from the features
y = df['price']
X = df.drop(columns=['price'])  # replace 'price' with your target variable

# Hold out 20% of the rows for testing
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Train the final RandomForestRegressor (fit() returns the estimator itself)
final_model = RandomForestRegressor().fit(X_train, y_train)

# Score the model on the held-out rows
y_pred = final_model.predict(X_test)

for label, metric in (
    ("Mean Absolute Error", mean_absolute_error),
    ("Mean Squared Error", mean_squared_error),
    ("R2 Score", r2_score),
):
    print(f"{label}:", metric(y_test, y_pred))

In [None]:
# Report that the notebook ran to the end. Catch Exception rather than using
# a bare except so KeyboardInterrupt and SystemExit still propagate.
try:
    print('Script Executed Successfully')
except Exception:
    print('FAILED')