In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import skew
from collections import Counter
from scipy import stats
import time
import re

In [2]:
%%time
# Load cars.csv into `data`; every preprocessing cell below reads and
# rebinds this one frame, so the cells are meant to be run top to bottom.
data = pd.read_csv("cars.csv")

CPU times: user 1.48 s, sys: 151 ms, total: 1.63 s
Wall time: 1.66 s


In [3]:
# Keep only rows with at least MIN_NON_NULL_FIELDS populated fields. With the
# 20 columns reported in the shape below, that tolerates at most 6 missing
# fields (30%) per row.
MIN_NON_NULL_FIELDS = 14

print("Before dropping irrelevant rows:", data.shape)
data.dropna(thresh=MIN_NON_NULL_FIELDS, inplace=True)
print("After dropping irrelevant rows:", data.shape)

Before dropping irrelevant rows: (762091, 20)
After dropping irrelevant rows: (748211, 20)


In [4]:
# Replace 'data' with your actual data and feature name
def visualize_feature_distribution_with_chart(data, feature_name, top_n):
    """Print and plot how often the most common values of a column occur.

    The `top_n` most frequent values of ``data[feature_name]`` are kept as
    they are; the counts of all remaining values are summed into one extra
    entry labelled 'Others'. The result is printed as text and drawn as a
    bar chart. Nothing is returned.
    """
    counts = data[feature_name].value_counts()

    # Series of the top-n counts with the aggregated remainder appended.
    summary = counts.iloc[:top_n]
    summary['Others'] = counts.iloc[top_n:].sum()

    # Text summary
    print(f"\nDistribution of {feature_name} (Top {top_n} + Others):")
    print(summary.to_string())

    # Bar chart of the same numbers
    plt.figure(figsize=(3, 2))
    sns.barplot(x=summary.index, y=summary.values)
    plt.title(f'Distribution of {feature_name} (Top {top_n} + Others)')
    plt.xlabel(feature_name)
    plt.ylabel('Frequency')
    plt.xticks(rotation=90)  # vertical labels so long category names stay legible
    plt.show()
    
    
def feature_distribution(data, feature_name, top_n):
    """Print the percentage share of the most common values of a column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the column.
    feature_name : str
        Column whose value distribution is summarised.
    top_n : int
        Number of most frequent values listed individually; all remaining
        values are summed into a single 'Others' entry.

    Returns
    -------
    None
        The summary (percentages rounded to two decimals, with a trailing
        '%') is printed, not returned.
    """
    # Get the value counts for the chosen feature
    feature_counts = data[feature_name].value_counts()

    # Calculate total count
    total_count = feature_counts.sum()

    # Counts for the top n values plus the aggregated remainder. The copy
    # keeps the new 'Others' label out of `feature_counts`.
    top_counts = feature_counts.head(top_n).copy()
    top_counts['Others'] = feature_counts.iloc[top_n:].sum()

    # Build the percentages as their own Series. The previous code stored a
    # whole Series under the single label 'Percentage' of `top_counts`, which
    # mixes counts with a nested object and is not a supported way to add a
    # column to a Series.
    percentages = (top_counts / total_count * 100).round(2).astype(str) + '%'
    percentages.name = 'Percentage'

    # Print a table summarizing the distribution
    print(f"\nDistribution of {feature_name} (Top {top_n} + Others):")
    print(percentages)

In [5]:
'''
Feature: manufacturer
Approach:  No missing values, one-hot-encode.
'''
# Optional: inspect the distribution first.
# visualize_feature_distribution_with_chart(data, 'manufacturer', 30)

manufacturers = data['manufacturer'].unique()
print("Unique values:", len(manufacturers))
print("Unique values of manufacturer:", manufacturers)

# Swap the text column for one 0/1 integer indicator column per manufacturer.
one_hot_encoded = pd.get_dummies(data['manufacturer'], prefix='manufacturer', dtype=int)
data = pd.concat([data.drop(columns='manufacturer'), one_hot_encoded], axis=1)

Unique values: 30
Unique values of manufacturer: ['Acura' 'Audi' 'BMW' 'Buick' 'Cadillac' 'Chevrolet' 'Chrysler' 'Dodge'
 'Ford' 'GMC' 'Honda' 'Hyundai' 'INFINITI' 'Jaguar' 'Jeep' 'Kia'
 'Land Rover' 'Lexus' 'Lincoln' 'Mazda' 'Mercedes-Benz' 'Mitsubishi'
 'Nissan' 'Porsche' 'RAM' 'Subaru' 'Tesla' 'Toyota' 'Volkswagen' 'Volvo']


In [6]:
'''
Feature: model
Approach: Keep only the first word of the model name, one-hot-encode.
'''

# Reduce each model name to its first whitespace-separated word. The
# vectorised string accessor leaves missing or blank names as NaN, where the
# previous per-row `m.split()[0]` raised on a NaN (float has no .split) or on
# a blank string (IndexError).
data['model'] = data['model'].str.split().str[0]

model_values = data['model'].unique()
print("Unique values:", len(model_values))
print("Unique values of model:", model_values)

# Swap the text column for one 0/1 integer indicator column per model word.
# Rows whose model is NaN get zeros in every indicator column.
one_hot_encoded = pd.get_dummies(data['model'], prefix='model', dtype=int)
data = pd.concat([data.drop(columns='model'), one_hot_encoded], axis=1)

Unique values: 742
Unique values of manufacturer: ['ILX' 'NSX' 'ZDX' 'RSX' 'Legend' 'MDX' 'RLX' 'Integra' 'TL' 'RDX' 'TSX'
 'CL' 'TLX' 'RL' 'A8' 'RS' 'Q8' 'e-tron' 'R8' 'Q7' 'RS6' 'Q5' 'S5' 'S4'
 'S3' 'A3' 'A4' 'SQ7' 'A5' 'TT' 'A6' 'TTS' 'Q3' 'S8' 'S6' 'Q4' 'S7' 'A7'
 'SQ5' '228' '650' '530e' '745e' '640' '840' 'M8' 'X4' 'M6' '850' '430'
 '328' '128' '325' '428' 'ALPINA' '340' '323' '740' 'Z3' 'M' 'Z4' '528'
 'M240' '440' '535' 'X3' 'X6' '330' 'X1' '550' '135' 'X7' 'i8' '645'
 'M440' 'i7' 'M760' '435' 'M235' 'M340' 'M850' 'M3' 'M4' '230' '740e'
 '535d' 'M5' 'M2' 'i4' '745' 'Z8' '335' '525' '318' '2002' '320' '540'
 'X2' '750' 'X5' '330e' '760' '530' 'ActiveHybrid' 'M550' '328d' 'iX' 'i3'
 'Electra' 'Centurion' 'Enclave' 'GSX' 'Envision' 'Lucerne' 'Cascada'
 'LeSabre' 'Encore' 'Riviera' 'Terraza' 'Century' 'Roadmaster' 'Super'
 'Rendezvous' 'Regal' 'Skylark' 'Estate' 'Limited' 'Special' 'Verano'
 'Model' 'Rainier' 'Reatta' 'Park' 'LaCrosse' 'GS' 'Eldorado' 'CT5-V'
 'XT4' 'CTS-V' 'Escala

In [None]:
'''
Feature: year
Approach: No missing values, one-hot-encode.
'''

year_values = data['year'].values

# Share of rows without a year.
total = len(year_values)
missing_count = np.isnan(year_values).sum()
print("Missing values:", missing_count, "(" , (missing_count / total) * 100, "%)")

# Number of distinct years.
unique_values = len(np.unique(year_values))
print("Unique Values:", unique_values)

# One-hot-encoding: one 0/1 integer column per year replaces the original.
one_hot_encoded = pd.get_dummies(data['year'], prefix='year', dtype=int)
data = pd.concat([data.drop(columns='year'), one_hot_encoded], axis=1)

In [None]:
'''
Feature: mileage
Approach: Replace all missing values with the median and perform z-scaling.
'''

mileage_raw = data['mileage'].to_numpy()

# calculate missing values (before imputation)
missing_count = np.isnan(mileage_raw).sum()
total = len(mileage_raw)
print("Missing values:", missing_count, "(" , (missing_count / total) * 100, "%)")

# calculate skewness of the observed (non-missing) values
skewness = skew(mileage_raw[~np.isnan(mileage_raw)])
print("Skewness of 'mileage':", skewness)

# Impute with the median, which is robust to the skew reported above.
median = np.nanmedian(mileage_raw)

# Assign the filled column back to the frame. Calling
# `data['mileage'].fillna(..., inplace=True)` only modifies a temporary
# selection: with pandas Copy-on-Write it never reaches `data`, and the
# statistics below used to be correct only when the array captured earlier
# happened to share memory with the frame.
data['mileage'] = data['mileage'].fillna(median)

# Take the statistics from the filled column itself so they cannot contain NaN.
mileage_values = data['mileage'].to_numpy()
mean = np.mean(mileage_values)
std = np.std(mileage_values)  # population standard deviation (ddof=0)

print("Mean:", mean)
print("Median:", median)
print("Standard Deviation:", std)

# Z-Scaling
scaled_mileage_values = (mileage_values - mean) / std

data['mileage'] = scaled_mileage_values

In [None]:
%%time
'''
Feature: engine
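Approach: Parse horsepower, liters, a turbo flag, cylinder count and injection type out of the engine description, impute missing values, one-hot-encode the injection type.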
'''

def extract_horsepower(engine_type):
    """Return the horsepower figure found in an engine description.

    Looks for a number, optionally with a decimal part, followed by ``HP``
    (whitespace in between is allowed), e.g. ``"300HP"`` or ``"300.0HP"``.

    Parameters
    ----------
    engine_type : str or missing value
        Free-text engine description.

    Returns
    -------
    int or float
        The horsepower rounded to the nearest integer, or ``np.nan`` when the
        input is not a string or contains no horsepower figure.
    """
    # Anything that is not text (NaN, None, numbers) carries no description.
    if not isinstance(engine_type, str):
        return np.nan
    # The optional decimal part matters: with a bare `\d+HP`, "300.0HP" would
    # match only the trailing "0HP" and be read as 0 horsepower.
    match = re.search(r'(\d+(?:\.\d+)?)\s*HP', engine_type)
    if match is None:
        return np.nan
    return int(round(float(match.group(1))))

def extract_liters(engine_type):
    """Return the liter figure from an engine description, or NaN.

    Only a decimal number directly followed by ``L`` (e.g. ``"3.0L"``) is
    recognised. Missing input yields ``np.nan``.
    """
    if pd.isna(engine_type):
        return np.nan
    found = re.search(r'(\d+\.\d+)L\b', engine_type)
    return float(found.group(1)) if found else np.nan

def is_turbo(engine_type):
    """Return 1 when the engine description contains "Turbo", otherwise 0.

    The check is case-sensitive; a missing description counts as 0.
    """
    if pd.isna(engine_type):
        return 0
    return 1 if "Turbo" in engine_type else 0

def extract_cylinders(engine_type):
    """Return the cylinder count from an engine description.

    Recognises the layouts ``I-<n>`` and ``V<n>`` (e.g. ``"I-4"``, ``"V6"``,
    ``"V12"``) as standalone tokens.

    Parameters
    ----------
    engine_type : str or missing value
        Free-text engine description.

    Returns
    -------
    int or float
        The number of cylinders, or ``np.nan`` when the input is not a string
        or no layout token is found.
    """
    if not isinstance(engine_type, str):
        return np.nan
    # Capture the digits themselves. Slicing a fixed two characters off the
    # whole match only works for "I-4": it leaves "" for "V6" and "2" for "V12".
    match = re.search(r'\b(?:I-|V)(\d+)\b', engine_type)
    return int(match.group(1)) if match else np.nan


def extract_injection_type(engine_type):
    """Return the first injection-type abbreviation found in the description.

    The leftmost occurrence of one of MPFI, GDI, SPFI, PGM-FI, DI, SIDI, TFSI
    or FSI is returned. The match is a plain substring match, so "TDI" yields
    "DI". Missing input or no match yields ``np.nan``.
    """
    if pd.isna(engine_type):
        return np.nan
    hit = re.search(r'(MPFI|GDI|SPFI|PGM-FI|DI|SIDI|TFSI|FSI)', engine_type)
    return np.nan if hit is None else hit.group(0)

def z_scaling(data):
    """Standardise `data`: subtract its mean, then divide by its std.

    Uses the object's own ``mean()`` and ``std()`` methods, so the definition
    of the standard deviation follows the type that is passed in.
    """
    centered = data - data.mean()
    return centered / data.std()

def handle_missing_values(data):
    """Impute missing values column by column.

    Numeric columns are filled with their median, all other columns with
    their most frequent value (the first one when several tie).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to impute. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same `data` object, for call sites that rebind the result.
    """
    for column in data.columns:
        series = data[column]
        # Nothing to fill: skip, which also avoids taking a median or mode of
        # columns that do not need one.
        if not series.isna().any():
            continue
        if pd.api.types.is_numeric_dtype(series):
            fill_value = series.median()
        else:
            modes = series.mode()
            if modes.empty:
                # Column is entirely missing, so there is no mode to impute with.
                continue
            fill_value = modes.iloc[0]
        # Assign the result back instead of `data[column].fillna(..., inplace=True)`:
        # the in-place call acts on a temporary selection and does not reach
        # `data` under pandas Copy-on-Write.
        data[column] = series.fillna(fill_value)
    return data


# Derive structured features from the free-text engine description.
data['horsepower'] = data['engine'].apply(extract_horsepower)
data['liters'] = data['engine'].apply(extract_liters)
data['turbo'] = data['engine'].apply(is_turbo)
data['cylinders'] = data['engine'].apply(extract_cylinders)
data['injection_type'] = data['engine'].apply(extract_injection_type)

# Impute only the columns derived above. Passing the whole frame would also
# fill every other column, including the `price` target and the columns whose
# own cells further down count and handle their missing values, so those
# cells would never see a NaN.
# NOTE(review): columns with no imputation step of their own (e.g. price)
# now keep their NaNs; check the missing-value total printed near the end.
engine_features = ['horsepower', 'liters', 'turbo', 'cylinders', 'injection_type']
data[engine_features] = handle_missing_values(data[engine_features].copy())

# Z-scaling (currently disabled)
# data['horsepower'] = z_scaling(data['horsepower'])
# data['liters'] = z_scaling(data['liters'])
# data['cylinders'] = z_scaling(data['cylinders'])

# One-hot-encoding of the injection type; the raw engine text is dropped too.
one_hot_encoded = pd.get_dummies(data['injection_type'], prefix='injection_type', dtype=int)
data = pd.concat([data.drop(columns=['injection_type', 'engine']), one_hot_encoded], axis=1)



In [None]:
'''
Feature: transmission
Approach: Map each transmission description to a standard transmission type and extract the number of speeds into a new column. One-hot-encode both.
'''
import re

def get_transmission_info(transmission):
    """Classify a transmission description and extract its speed count.

    Returns a ``(transmission_type, number_of_speeds)`` tuple of strings.
    Either element is "Unknown" when it cannot be determined; input that is
    not a string yields ("Unknown", "Unknown"). Matching is done on the
    lower-cased text.
    """
    transmission_type = "Unknown"
    number_of_speeds = "Unknown"

    if not isinstance(transmission, str):
        return (transmission_type, number_of_speeds)

    text = transmission.lower()

    # Ordered rules: the first group with any keyword present decides the
    # type, so e.g. a description containing both "manual" and "automatic"
    # is classified as Manual.
    rules = (
        (("cvt", "shiftronic", "continuously"), "Automatic Continuously Variable"),
        (("dual", "dct"), "Dual Clutch"),
        (("semi", "autostick", "paddle"), "Semi-Automatic"),
        (("manual", " m "), "Manual"),
        (("automatic", "auto", "a/t", " a ", " at"), "Automatic"),
    )
    for keywords, label in rules:
        if any(keyword in text for keyword in keywords):
            transmission_type = label
            break

    # A number followed by "spd" or "speed", optionally separated by
    # whitespace and/or a hyphen, gives the number of speeds.
    speeds = re.search(r'(\d+)\s*-?\s*(spd|speed)', text)
    if speeds:
        number_of_speeds = speeds.group(1) + "-speed"

    return (transmission_type, number_of_speeds)

# Classify every description once, then split the (type, speeds) pairs into
# the two columns.
transmission_info = data["transmission"].apply(get_transmission_info)
data["transmission"] = [kind for kind, _ in transmission_info]
data["number_of_speeds"] = [speeds for _, speeds in transmission_info]

print("Transmission unique values:", data['transmission'].nunique(dropna=False))

# Transmission type -> 0/1 integer indicator columns
one_hot_encoded = pd.get_dummies(data['transmission'], prefix='transmission', dtype=int)
data = pd.concat([data.drop(columns='transmission'), one_hot_encoded], axis=1)

print("Number of speeds unique values:", data['number_of_speeds'].nunique(dropna=False))

# Number of speeds -> 0/1 integer indicator columns
one_hot_encoded = pd.get_dummies(data['number_of_speeds'], prefix='number_of_speeds', dtype=int)
data = pd.concat([data.drop(columns='number_of_speeds'), one_hot_encoded], axis=1)

In [None]:
'''
Feature: drivetrain
Approach: Got 4 most dominant drivetrains, put rest and missing values in 'other'. One-hot-encoding.
'''

def process_drivetrain(drivetrain):
    """Map a drivetrain description to one of four standard labels or 'other'.

    Matching is done on the lower-cased text and the rules are tried in
    order (front, all, four, rear); the first rule with a keyword present
    wins. Input that is not a string, or that matches no rule, yields 'other'.
    """
    if not isinstance(drivetrain, str):
        return 'other'

    text = drivetrain.lower()
    rules = (
        (("front", "fwd"), "Front-Wheel Drive"),
        (("all", "awd"), "All-Wheel Drive"),
        (("four", "4"), "Four-Wheel Drive"),
        (("rear", "rwd"), "Rear-Wheel Drive"),
    )
    for keywords, label in rules:
        if any(keyword in text for keyword in keywords):
            return label
    return 'other'
        

drivetrain_col = data['drivetrain']

# Share of rows without a drivetrain (measured before recoding).
missing_count = drivetrain_col.isnull().sum()
total = len(drivetrain_col)
print("Missing values:", missing_count, "(" , (missing_count / total) * 100, "%)")

# Recode to the standard labels; missing values end up in 'other'.
drivetrain_col = drivetrain_col.apply(process_drivetrain)
data['drivetrain'] = drivetrain_col

# Non-missing descriptions that fell into 'other'.
other_count = (drivetrain_col == 'other').sum()
print("'Other' count excluding missing:", other_count - missing_count)

# calculate unique values
unique_values = drivetrain_col.nunique(dropna=False)
print("Unique Values:", unique_values)

# One-hot-encoding
one_hot_encoded = pd.get_dummies(drivetrain_col, prefix='drivetrain', dtype=int)
data = pd.concat([data.drop(columns='drivetrain'), one_hot_encoded], axis=1)

In [None]:
'''
Feature: fuel_type
Approach: Got the top 6 fuel types. Put the rest and missing values into 'other'. One-hot-encoding.
'''

def process_fuel_type(fuel):
    """Return the lower-cased fuel type if it is in `fuel_types`, else 'other'.

    `fuel_types` is read from module scope at call time, so it must be
    defined before this function is first called. Input that is not a string
    is treated as 'other'.
    """
    normalized = fuel.lower() if isinstance(fuel, str) else 'other'
    return normalized if normalized in fuel_types else 'other'

        
# Fuel labels that keep their own category; process_fuel_type reads this list.
fuel_types = ['gasoline', 'hybrid', 'diesel', 'e85 flex fuel', 'electric', 'b']

fuel_col = data['fuel_type']

# Share of rows without a fuel type (measured before recoding).
missing_count = fuel_col.isnull().sum()
total = len(fuel_col)
print("Missing values:", missing_count, "(" , (missing_count / total) * 100, "%)")

# Recode; missing values and unlisted fuels end up in 'other'.
fuel_col = fuel_col.apply(process_fuel_type)
data['fuel_type'] = fuel_col

# Non-missing fuel types that fell into 'other'.
other_count = (fuel_col == 'other').sum()
print("'Other' count excluding missing:", other_count - missing_count)

# calculate unique values
unique_values = fuel_col.nunique(dropna=False)
print("Unique Values:", unique_values)

# One-hot-encoding
one_hot_encoded = pd.get_dummies(fuel_col, prefix='fuel_type', dtype=int)
data = pd.concat([data.drop(columns='fuel_type'), one_hot_encoded], axis=1)

In [None]:
'''
Feature: mpg
Approach: Convert ranges to their midpoint, replace all missing values with the mean and perform z-scaling.
'''

def calculate_middle_value(mpg_values):
    """Replace "low-high" range strings by their midpoint.

    Parameters
    ----------
    mpg_values : pandas.Series or iterable
        Values to convert. Strings of the form ``"<number>-<number>"`` are
        replaced by the average of the two numbers; everything else is
        passed through unchanged.

    Returns
    -------
    pandas.Series
        One entry per input value. When the input is a Series its index is
        kept, so the result can be assigned straight back to the frame it
        came from; otherwise a default 0..n-1 index is used.
    """
    updated_values = []
    for value in mpg_values:
        if isinstance(value, str) and '-' in value:
            try:
                start, end = map(float, value.split('-'))
            except ValueError:
                # Not a clean two-number range (e.g. "20-", "-5", "1-2-3"):
                # pass it through and let the caller's numeric coercion decide.
                updated_values.append(value)
                continue
            updated_values.append((start + end) / 2)
        else:
            updated_values.append(value)
    # Keep the caller's row labels. A fresh RangeIndex would make a later
    # `frame[col] = result` align on the wrong rows whenever the frame's
    # index has gaps (e.g. after rows were dropped).
    index = mpg_values.index if isinstance(mpg_values, pd.Series) else None
    return pd.Series(updated_values, index=index)


# Collapse "low-high" ranges to their midpoint, then force everything to a
# number; anything that cannot be parsed becomes NaN.
mpg_values = pd.to_numeric(calculate_middle_value(data['mpg']), errors='coerce')

# Re-attach the frame's row labels. If the helper hands back a Series with a
# fresh 0..n-1 index while `data` still carries the gapped labels left by the
# earlier dropna, the assignment below aligns on mismatched labels: values
# land on the wrong rows and the unmatched rows become NaN.
mpg_values.index = data.index

# Replace missing values with the mean of the observed values
mean = np.nanmean(mpg_values)
mpg_values = mpg_values.fillna(mean)

# Calculate skewness
skewness = skew(mpg_values)
print("Skewness of 'mpg':", skewness)

# Calculate mean and standard deviation
mean = np.nanmean(mpg_values)
std = np.nanstd(mpg_values)

# Z-Scaling
scaled_mpg_values = (mpg_values - mean) / std

data['mpg'] = scaled_mpg_values

# With the rows aligned, no second imputation pass is needed; this is a check.
print("Number of missing values in 'mpg' column:", data['mpg'].isnull().sum())

In [None]:
'''
Feature: exterior_color
Approach: Categorize top 30 colors, others in 'other', one-hot-encoding.
'''

# Create a list of lower-cased words from the non-missing 'exterior_color'
# entries. Missing values are skipped: str(NaN) is "nan", which would
# otherwise be counted as a word and could end up among the top colors.
words = [word.lower() for item in data['exterior_color'].dropna() for word in str(item).split()]

# Count the frequency of each word
word_freq = Counter(words)

# The 30 most frequent words, in descending order of frequency
sorted_freq = word_freq.most_common(30)

top_30_colors = [word for word, _ in sorted_freq]

def replace_words(item):
    """Return the first word of `item` that is in `top_30_colors`, else 'other'.

    `item` is converted with ``str()`` and split on whitespace; words are
    compared, and returned, in lower case. `top_30_colors` is read from
    module scope at call time.
    """
    lowered = (word.lower() for word in str(item).split())
    return next((word for word in lowered if word in top_30_colors), 'other')


exterior = data['exterior_color']

# Share of rows without an exterior color (measured before recoding).
missing_count = exterior.isnull().sum()
total = len(exterior)
print("Missing values:", missing_count, "(" , (missing_count / total) * 100, "%)")

# Recode every description to a top color word or 'other'.
exterior = exterior.apply(replace_words)
data['exterior_color'] = exterior

# 'other' rows minus the rows that were missing to begin with.
other_count = (exterior == 'other').sum()
print("'Other' count excluding missing:", other_count - missing_count)

# calculate unique values
unique_values = exterior.nunique(dropna=False)
print("Unique Values:", unique_values)

# One-hot-encoding
one_hot_encoded = pd.get_dummies(exterior, prefix='exterior_color', dtype=int)
data = pd.concat([data.drop(columns='exterior_color'), one_hot_encoded], axis=1)

In [None]:
'''
Feature: interior_color
Approach: Categorize top 20 colors, others in 'other', one-hot-encoding.
'''

# Create a list of lower-cased words from the non-missing 'interior_color'
# entries. Missing values are skipped: str(NaN) is "nan", which would
# otherwise be counted as a word and could end up among the top colors.
words = [word.lower() for item in data['interior_color'].dropna() for word in str(item).split()]

# Count the frequency of each word
word_freq = Counter(words)

# The 20 most frequent words, in descending order of frequency
sorted_freq = word_freq.most_common(20)

top_20_colors = [word for word, _ in sorted_freq]

def replace_words(item):
    """Return the first word of `item` that is in `top_20_colors`, else 'other'.

    `item` is converted with ``str()`` and split on whitespace; words are
    compared, and returned, in lower case. `top_20_colors` is read from
    module scope at call time. This definition replaces the earlier function
    of the same name once the cell has run.
    """
    lowered = (word.lower() for word in str(item).split())
    return next((word for word in lowered if word in top_20_colors), 'other')


interior = data['interior_color']

# Share of rows without an interior color (measured before recoding).
missing_count = interior.isnull().sum()
total = len(interior)
print("Missing values:", missing_count, "(" , (missing_count / total) * 100, "%)")

# Recode every description to a top color word or 'other'.
interior = interior.apply(replace_words)
data['interior_color'] = interior

# 'other' rows minus the rows that were missing to begin with.
other_count = (interior == 'other').sum()
print("'Other' count excluding missing:", other_count - missing_count)

# calculate unique values
unique_values = interior.nunique(dropna=False)
print("Unique Values:", unique_values)

# One-hot-encoding
one_hot_encoded = pd.get_dummies(interior, prefix='interior_color', dtype=int)
data = pd.concat([data.drop(columns='interior_color'), one_hot_encoded], axis=1)

In [None]:
'''
Feature: accidents_or_damage
Approach: Split into 3 new columns (true, false, missing)
'''

accidents = data['accidents_or_damage']

# calculate missing values
missing_count = accidents.isnull().sum()
total = len(accidents)
print("Missing values:", missing_count, "(" , (missing_count / total) * 100,"%)")

# Give NaN its own label so it becomes an indicator column of its own, then
# swap the original column for the 0/1 integer indicator columns.
one_hot_encoded = pd.get_dummies(accidents.fillna('missing'), prefix='accidents_or_damage', dtype=int)
data = pd.concat([data.drop(columns='accidents_or_damage'), one_hot_encoded], axis=1)

In [None]:
'''
Feature: one_owner
Approach: Split into 3 new columns (true, false, missing)
'''

one_owner = data['one_owner']

# calculate missing values
missing_count = one_owner.isnull().sum()
total = len(one_owner)
print("Missing values:", missing_count, "(" , (missing_count / total) * 100,"%)")

# Give NaN its own label so it becomes an indicator column of its own, then
# swap the original column for the 0/1 integer indicator columns.
one_hot_encoded = pd.get_dummies(one_owner.fillna('missing'), prefix='one_owner', dtype=int)
data = pd.concat([data.drop(columns='one_owner'), one_hot_encoded], axis=1)

In [None]:
'''
Feature: personal_use_only
Approach: Split into 3 new columns (true, false, missing)
'''

personal_use = data['personal_use_only']

# calculate missing values
missing_count = personal_use.isnull().sum()
total = len(personal_use)
print("Missing values:", missing_count, "(" , (missing_count / total) * 100,"%)")

# Give NaN its own label so it becomes an indicator column of its own, then
# swap the original column for the 0/1 integer indicator columns.
one_hot_encoded = pd.get_dummies(personal_use.fillna('missing'), prefix='personal_use_only', dtype=int)
data = pd.concat([data.drop(columns='personal_use_only'), one_hot_encoded], axis=1)

In [None]:
'''
Feature: seller_name
Approach: Drop feature
'''

# The seller name is not used as a feature.
data = data.drop(columns=['seller_name'])

In [None]:
'''
Feature: seller_rating
Approach: Replace all missing values with 0 and perform z-scaling.
'''

# Calculate missing values rate (before imputation)
missing_count = data['seller_rating'].isna().sum()
total = len(data['seller_rating'])
print("Missing values:", missing_count, "(", (missing_count / total) * 100, "%)")

# Fill missing values with 0 and assign the result back to the frame.
# `data['seller_rating'].fillna(0, inplace=True)` only modifies a temporary
# selection: with pandas Copy-on-Write it never reaches `data`, and the
# statistics below would then be NaN.
data['seller_rating'] = data['seller_rating'].fillna(0)

# Take the statistics from the filled column itself.
seller_rating_values = data['seller_rating'].to_numpy()
mean = np.mean(seller_rating_values)
std = np.std(seller_rating_values)  # population standard deviation (ddof=0)

print("Mean:", mean)
print("Standard Deviation:", std)

# Perform z-scaling
scaled_seller_rating_values = (seller_rating_values - mean) / std

data['seller_rating'] = scaled_seller_rating_values

In [None]:
'''
Feature: driver_rating
Approach: Replace all missing values with the median and perform z-scaling.
'''

driver_rating_raw = data['driver_rating'].to_numpy()

# calculate missing values rate (before imputation)
missing_count = np.isnan(driver_rating_raw).sum()
total = len(driver_rating_raw)
print("Missing values:", missing_count, "(" , (missing_count / total) * 100,"%)")

# calculate skewness of the observed (non-missing) values
skewness = skew(driver_rating_raw[~np.isnan(driver_rating_raw)])
print("Skewness of 'driver_rating_values':", skewness)

# Impute with the median, which is robust to the skew reported above.
median = np.nanmedian(driver_rating_raw)

# Assign the filled column back to the frame. Calling
# `data['driver_rating'].fillna(..., inplace=True)` only modifies a temporary
# selection: with pandas Copy-on-Write it never reaches `data`, and the
# statistics below would then be NaN.
data['driver_rating'] = data['driver_rating'].fillna(median)

# Take the statistics from the filled column itself.
driver_rating_values = data['driver_rating'].to_numpy()
mean = np.mean(driver_rating_values)
std = np.std(driver_rating_values)  # population standard deviation (ddof=0)

print("Mean:", mean)
print("Median:", median)
print("Standard Deviation:", std)

# (x - mean) / std
scaled_driver_rating_values = (driver_rating_values - mean) / std

data['driver_rating'] = scaled_driver_rating_values

In [None]:
'''
Feature: driver_reviews_num
Approach: No missing values, perform z-scaling.
'''

driver_reviews_values = data['driver_reviews_num'].values

# Number of distinct review counts.
print(len(np.unique(driver_reviews_values)))

# calculate missing values rate
total = len(driver_reviews_values)
missing_count = np.isnan(driver_reviews_values).sum()
print("Missing values:", missing_count, "(" , (missing_count/total) * 100,"%)")

# Mean and population standard deviation (np.std default, ddof=0).
mean, std = np.mean(driver_reviews_values), np.std(driver_reviews_values)

print("Mean:", mean)
print("Standard Deviation:", std)

# Standardise: (x - mean) / std
scaled_driver_reviews_values = (driver_reviews_values - mean) / std
data['driver_reviews_num'] = scaled_driver_reviews_values

In [None]:
'''
Name: 
Feature: price_drop
Approach: Drop feature
'''

# The price drop is not used as a feature.
data = data.drop(columns=['price_drop'])

In [None]:
# for percentile in range(0, 101, 1):
#     value_at_percentile = data['price'].quantile(percentile / 100)
#     print(f"Value at {percentile}th percentile:", value_at_percentile)

# print()

# 1st and 99th percentile of the price, printed for reference only.
low, high = (data['price'].quantile(q) for q in (0.01, 0.99))

print("Value at 1th percentile:", low)
print("Value at 99th percentile:", high)

# Cap prices to a fixed range.
# NOTE(review): the bounds are hard-coded rather than taken from low/high;
# presumably rounded from the percentiles above. Confirm.
PRICE_FLOOR, PRICE_CEILING = 3000, 120000
data['price'] = data['price'].clip(lower=PRICE_FLOOR, upper=PRICE_CEILING)

In [None]:
# Columns that earlier cells converted to z-scores. Only these are capped at
# +/-2 standard deviations. The previous rule (every float64 column except
# price) also hit raw, unscaled float columns such as liters, whose z-scaling
# is commented out in the engine cell, and flattened their values to at most 2.
z_scaled_columns = ['mileage', 'mpg', 'seller_rating', 'driver_rating', 'driver_reviews_num']

for column in data.columns:
    if data[column].dtype in ['int64', 'float64']:  # numeric columns only
        if data[column].nunique() > 2:  # skip 0/1 indicator columns
            print(f"Column Name: {column}")
            print(f"Max Value before transformation: {data[column].max()}")
            print(f"Min Value before transformation: {data[column].min()}")

            # Cap outliers of the standardised columns
            if column in z_scaled_columns:
                data[column] = data[column].clip(lower=-2, upper=2)

            print(f"Max Value after transformation: {data[column].max()}")
            print(f"Min Value after transformation: {data[column].min()}")
            print()


In [None]:
# Renumber the rows 0..n-1 and discard the old labels, which still have gaps
# from the rows dropped at the start.
data = data.reset_index(drop=True)

In [None]:
# Total number of NaN cells left across all columns after preprocessing.
print("Missing values in dataset:", data.isna().sum().sum())

In [None]:
# Preview the first rows of the preprocessed frame.
data.head()

In [None]:
# Persist the preprocessed frame; the row index is not written.
data.to_csv('cleaned_data.csv', index=False)

<h2>Training Models</h2>

In [None]:
import sklearn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import time

In [None]:
# (rows, columns) of the frame that goes into training.
print(data.shape)

In [None]:
# for i in range (0, 1000):
#     print(data.columns[i])

In [None]:
# Features are every column except the target.
TARGET_COLUMN = 'price'
X = data.drop(columns=[TARGET_COLUMN])
y = data[TARGET_COLUMN]

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

<h2>KNN</h2>

<h2>Linear Regression</h2>

<h2>Decision Tree</h2>

<h2>Random Forests</h2>

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Random forest with 20 trees; the fixed random_state makes runs repeatable.
model = RandomForestRegressor(n_estimators=20, random_state=0)

In [None]:
%%time

# Train on the training split; the %%time magic above reports the duration.
model.fit(X_train, y_train)

In [None]:
# Predict prices for the held-out test rows.
y_pred = model.predict(X_test)

In [None]:
# R-squared of the predictions against the true prices of the test split.
r2 = r2_score(y_test, y_pred)
print("R-squared score:", r2)