In [1]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score


In [3]:
# Location of the car-listing CSV. The default is the original author's local
# path; set the CAR_DATA_CSV environment variable to point the notebook at the
# file on another machine instead of editing this cell.
DATA_PATH = os.environ.get(
    "CAR_DATA_CSV",
    "C:/Users/Muhammad Mitkar/Desktop/OIBSIP/task3car/car data.csv",
)

# Load the raw listings into a DataFrame
df = pd.read_csv(DATA_PATH)
df.head()  # View the first few rows


Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Driven_kms,Fuel_Type,Selling_type,Transmission,Owner
0,ritz,2014,3.35,5.59,27000,Petrol,Dealer,Manual,0
1,sx4,2013,4.75,9.54,43000,Diesel,Dealer,Manual,0
2,ciaz,2017,7.25,9.85,6900,Petrol,Dealer,Manual,0
3,wagon r,2011,2.85,4.15,5200,Petrol,Dealer,Manual,0
4,swift,2014,4.6,6.87,42450,Diesel,Dealer,Manual,0


In [5]:
# Check for missing values (number of nulls per column)
print(df.isnull().sum())

# Convert categorical columns into numerical (one-hot) values.
# drop_first=True removes one level per column so the dummies are not
# perfectly collinear. Only columns that are still present are encoded, so
# re-running this cell on the already-encoded frame is a no-op rather than a
# KeyError.
categorical_cols = [
    col
    for col in ['Fuel_Type', 'Selling_type', 'Transmission']
    if col in df.columns
]
if categorical_cols:
    df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Drop irrelevant columns; errors='ignore' keeps the cell safe to re-run
# after Car_Name has already been removed.
df = df.drop(columns=['Car_Name'], errors='ignore')

# View updated data
print(df.head())


Car_Name         0
Year             0
Selling_Price    0
Present_Price    0
Driven_kms       0
Fuel_Type        0
Selling_type     0
Transmission     0
Owner            0
dtype: int64
   Year  Selling_Price  Present_Price  Driven_kms  Owner  Fuel_Type_Diesel  \
0  2014           3.35           5.59       27000      0             False   
1  2013           4.75           9.54       43000      0              True   
2  2017           7.25           9.85        6900      0             False   
3  2011           2.85           4.15        5200      0             False   
4  2014           4.60           6.87       42450      0              True   

   Fuel_Type_Petrol  Selling_type_Individual  Transmission_Manual  
0              True                    False                 True  
1             False                    False                 True  
2              True                    False                 True  
3              True                    False                 True  
4             False                    False                 True  

In [7]:
# Column the regression is trained to predict
target_col = 'Selling_Price'

# Dependent variable (target)
y = df[target_col]

# Independent variables (features): every column except the target
X = df.drop(columns=[target_col])

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [9]:
# Standardise the features. The scaler is fitted on the training split only
# and then applied unchanged to the test split.
# The scaled arrays get their own names: overwriting X_train / X_test would
# make a second run of this cell refit the scaler on already-standardised
# data, silently breaking any later scaler.transform(...) call on raw inputs.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize and train model on the scaled training features
model = LinearRegression()
model.fit(X_train_scaled, y_train)

# Predict on test data
y_pred = model.predict(X_test_scaled)


In [11]:
# Score the held-out predictions against the true selling prices
r2 = r2_score(y_test, y_pred)
print(f"R2 Score: {r2:.2f}")

mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse:.2f}")


R2 Score: 0.85
Mean Squared Error: 3.48


In [13]:
# Describe the car to price as a single record; keys are listed in the same
# order as the feature columns printed after encoding.
new_car_record = {
    'Year': 2018,
    'Present_Price': 10.5,
    'Driven_kms': 25000,
    'Owner': 0,
    'Fuel_Type_Diesel': 0,         # Petrol car, so the Diesel dummy is 0
    'Fuel_Type_Petrol': 1,         # Petrol = 1
    'Selling_type_Individual': 0,  # Dealer = 0, Individual = 1
    'Transmission_Manual': 1,      # Manual = 1, Automatic = 0
}
new_car = pd.DataFrame([new_car_record])

# Reuse the scaler fitted on the training data so the new row is standardised
# the same way as the rows the model was trained on.
new_car_scaled = scaler.transform(new_car)

# Predict selling price (the model returns one value per input row)
predicted_price = model.predict(new_car_scaled)
print(f"Predicted Selling Price: ₹{predicted_price[0]:.2f} lakhs")


Predicted Selling Price: ₹7.47 lakhs
