In [356]:
import pandas as pd

In [357]:
df = pd.read_csv("car_data.csv")
df.head()

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Driven_kms,Fuel_Type,Selling_type,Transmission,Owner
0,ritz,2014,3.35,5.59,27000,Petrol,Dealer,Manual,0
1,sx4,2013,4.75,9.54,43000,Diesel,Dealer,Manual,0
2,ciaz,2017,7.25,9.85,6900,Petrol,Dealer,Manual,0
3,wagon r,2011,2.85,4.15,5200,Petrol,Dealer,Manual,0
4,swift,2014,4.6,6.87,42450,Diesel,Dealer,Manual,0


In [358]:
# Check for any missing values
print(df.isnull().sum())


Car_Name         0
Year             0
Selling_Price    0
Present_Price    0
Driven_kms       0
Fuel_Type        0
Selling_type     0
Transmission     0
Owner            0
dtype: int64


In [359]:
# Use one-hot encoding for 'Fuel_Type', 'Selling_type', and 'Transmission' (not numeric columns)
car_data_encoded = pd.get_dummies(df, columns=['Fuel_Type', 'Selling_type', 'Transmission'], drop_first=True)

# Drop 'Car_Name' since it's not useful for prediction
car_data_encoded = car_data_encoded.drop(columns=['Car_Name'])

# Display the new dataframe after encoding
car_data_encoded.head()


Unnamed: 0,Year,Selling_Price,Present_Price,Driven_kms,Owner,Fuel_Type_Diesel,Fuel_Type_Petrol,Selling_type_Individual,Transmission_Manual
0,2014,3.35,5.59,27000,0,False,True,False,True
1,2013,4.75,9.54,43000,0,True,False,False,True
2,2017,7.25,9.85,6900,0,False,True,False,True
3,2011,2.85,4.15,5200,0,False,True,False,True
4,2014,4.6,6.87,42450,0,True,False,False,True


In [360]:
car_data_encoded = car_data_encoded.astype(int)
car_data_encoded

Unnamed: 0,Year,Selling_Price,Present_Price,Driven_kms,Owner,Fuel_Type_Diesel,Fuel_Type_Petrol,Selling_type_Individual,Transmission_Manual
0,2014,3,5,27000,0,0,1,0,1
1,2013,4,9,43000,0,1,0,0,1
2,2017,7,9,6900,0,0,1,0,1
3,2011,2,4,5200,0,0,1,0,1
4,2014,4,6,42450,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...
296,2016,9,11,33988,0,1,0,0,1
297,2015,4,5,60000,0,0,1,0,1
298,2009,3,11,87934,0,0,1,0,1
299,2017,11,12,9000,0,1,0,0,1


In [361]:
# Define features (X) 
X = car_data_encoded.drop(columns=['Selling_Price'])  # Features
X.head()

Unnamed: 0,Year,Present_Price,Driven_kms,Owner,Fuel_Type_Diesel,Fuel_Type_Petrol,Selling_type_Individual,Transmission_Manual
0,2014,5,27000,0,0,1,0,1
1,2013,9,43000,0,1,0,0,1
2,2017,9,6900,0,0,1,0,1
3,2011,4,5200,0,0,1,0,1
4,2014,6,42450,0,1,0,0,1


In [362]:
# Define target (y)

y = car_data_encoded['Selling_Price']  # Target (Selling Price)
y.head()

0    3
1    4
2    7
3    2
4    4
Name: Selling_Price, dtype: int64

In [363]:
from sklearn.model_selection import train_test_split

# Split the data into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Print the shapes of the train and test datasets
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)


(240, 8) (61, 8) (240,) (61,)


**LinearRegression**

In [364]:
from sklearn.linear_model import LinearRegression

# Create the decision tree model
linear_model = LinearRegression()

# Train the model
linear_model.fit(X_train, y_train)

# Make predictions
y_pred_tree = linear_model.predict(X_test)

# Evaluate the model
from sklearn.metrics import r2_score
score_linear = r2_score(y_test, y_pred_tree)
print(f'R-squared score (linear model): {score_linear}')

R-squared score (linear model): 0.6344543646782697


In [365]:
# Make predictions on the test set
y_pred = linear_model.predict(X_test)

# Show some predicted values alongside actual values
pd.DataFrame({'Actual': y_test, 'Predicted': y_pred}).head()


Unnamed: 0,Actual,Predicted
281,2,1.571299
289,10,7.92488
85,2,7.14176
255,3,2.029625
118,1,0.531157


In [366]:
# Example of a new car's feature data: [Year, Present_Price, Driven_kms, Owner, Fuel_Type_Diesel, Fuel_Type_Petrol, Selling_type_Individual, Transmission_Manual]
linear_model.predict([[2016, 8.5, 30000, 0, 1, 0, 0, 1]])  # # new values



array([7.72673761])

In [367]:
linear_model.predict([[2014,5,27000,0,0,1,0,1]])   # values from dataframe



array([3.40351788])

In [368]:
# from sklearn import tree
# model_D = tree.DecisionTreeClassifier()
# model_D.fit(X_train, y_train)
# model_D.score(X_test,y_test)   # output :  0.6229508196721312

**DecisionTreeRegressor**

In [369]:
from sklearn.tree import DecisionTreeRegressor

# Create the decision tree model
tree_model = DecisionTreeRegressor(random_state=42)

# Train the model
tree_model.fit(X_train, y_train)

# Make predictions
y_pred_tree = tree_model.predict(X_test)

# Evaluate the model
from sklearn.metrics import r2_score
score_tree = r2_score(y_test, y_pred_tree)
print(f'R-squared score (Decision Tree): {score_tree}')


R-squared score (Decision Tree): 0.9185596307527989


In [370]:
tree_model.predict([[2014,5,27000,0,0,1,0,1]]) # values from dataframe



array([4.])

In [371]:
tree_model.predict([[2016, 8.5, 30000, 0, 1, 0, 0, 1]])  # new values



array([6.])

**RandomForestRegressor**

In [372]:
from sklearn.ensemble import RandomForestRegressor

# Create the random forest model
forest_model = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model
forest_model.fit(X_train, y_train)

# Make predictions
y_pred_forest = forest_model.predict(X_test)

# Evaluate the model
score_forest = r2_score(y_test, y_pred_forest)
print(f'R-squared score (Random Forest): {score_forest}')


R-squared score (Random Forest): 0.9511435734584501


In [373]:
forest_model.predict([[2014,5,27000,0,0,1,0,1]]) # values from dataframe



array([4.15])

In [374]:
forest_model.predict([[2016, 8.5, 30000, 0, 1, 0, 0, 1]]) # new values



array([7.31])

**COMPARISON**

In [375]:
print(f'R-squared score (linear model): {score_linear}')
print(f'R-squared score (Decision Tree): {score_tree}')
print(f'R-squared score (Random Forest): {score_forest}')

R-squared score (linear model): 0.6344543646782697
R-squared score (Decision Tree): 0.9185596307527989
R-squared score (Random Forest): 0.9511435734584501
