### 1) Import Packages

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#-------------------------------------SKLEARN Packages-------------------------
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

### 2) Read Data 

In [None]:
# Load the car-price dataset from its repository-relative CSV and preview the first rows.
csv_path = "data/car_price.csv"
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Summarise the freshly loaded frame (columns, dtypes, non-null counts) before any transformation.
df.info()

In [None]:
# How many rows fall into each fuel-type category.
df['Fuel_Type'].value_counts()

In [None]:
# How many rows fall into each transmission category.
df['Transmission'].value_counts()

In [None]:
# How many rows fall into each owner-type category.
df['Owner_Type'].value_counts()

In [None]:
# How many rows exist for each distinct seat count.
df['Seats'].value_counts()

In [None]:
# Scatter of price against distance driven (column values as loaded, before any rescaling).
# Axis labels and a title are set so the figure is readable on its own.
plt.scatter(df['Kilometers_Driven'], df['Price'])
plt.xlabel('Kilometers_Driven')
plt.ylabel('Price')
plt.title('Price vs. Kilometers_Driven');

In [None]:
# Rescale the distance column by a factor of 1000.
# NOTE: the column is overwritten in place, so re-running this cell divides it again.
df['Kilometers_Driven'] = df['Kilometers_Driven'] / 1000

In [None]:
# Correlation-coefficient matrix between distance driven and price, before outlier treatment.
np.corrcoef(df['Kilometers_Driven'], df['Price'])

In [None]:
# Summary statistics of the rescaled distance column.
df['Kilometers_Driven'].describe()

In [None]:
# Histogram of the distance column before outlier treatment; the trailing
# semicolon suppresses the Axes repr in the cell output.
df.Kilometers_Driven.plot.hist(title='Frequency distribution ', bins=100);

##### Outlier Detection

In [None]:
# Box plot of the distance column before outlier treatment.
# The trailing semicolon suppresses the Axes repr, matching the later box-plot cell.
df.boxplot(column=['Kilometers_Driven']);

#### Detecting outliers using the interquartile range (IQR)

![title](asset/percentile.png)

In [None]:
# First and third quartiles of the distance column; their difference is the IQR.
q25, q75 = np.percentile(df['Kilometers_Driven'], [25, 75])
iqr = q75 - q25

# Whiskers sit 1.5 * IQR beyond the quartiles; values outside them are treated as outliers.
lower_whisker = q25 - 1.5 * iqr
upper_whisker = q75 + 1.5 * iqr

In [None]:
def check(x, ul, ll):
    """Return ``x`` if it lies within ``[ll, ul]`` (both ends inclusive), else ``None``.

    Parameters
    ----------
    x : value being tested.
    ul : upper limit of the accepted range.
    ll : lower limit of the accepted range.
    """
    inside = ll <= x <= ul
    return x if inside else None

In [None]:
# `check` yields None for values outside the whiskers, so null entries mark the outliers.
in_range = df['Kilometers_Driven'].apply(check, args=(upper_whisker, lower_whisker))
outlier_count = int(in_range.isnull().sum())

# Share of rows flagged as outliers; the label names the column actually measured.
print("Percentage of Outliers in Kilometers_Driven:", outlier_count / len(df) * 100)

In [None]:
def transform_kilometer(x, cap=None):
    """Cap a single distance value at an upper limit.

    Parameters
    ----------
    x : value to cap.
    cap : optional upper limit. When omitted, the notebook-level
        ``upper_whisker`` is used, which preserves the original
        single-argument behaviour.

    Returns
    -------
    ``cap`` when ``x`` is strictly greater than ``cap``; otherwise ``x`` unchanged.
    """
    if cap is None:
        # Fall back to the whisker computed earlier in the notebook.
        cap = upper_whisker
    return cap if x > cap else x

# Cap every distance above the upper whisker; values at or below it are left untouched.
df['Kilometers_Driven'] = df['Kilometers_Driven'].apply(transform_kilometer)

# Re-plot the distribution to see the effect of the capping.
df.Kilometers_Driven.plot.hist(title='Frequency distribution after Outlier Treatment', bins=100);

In [None]:
# Box plot of the distance column after capping at the upper whisker.
df.boxplot(column='Kilometers_Driven');

In [None]:
# Correlation-coefficient matrix between distance driven and price, after outlier capping.
np.corrcoef(df['Kilometers_Driven'], df['Price'])

In [None]:
# Price distribution for each owner-type category.
sns.boxplot(data=df, x='Owner_Type', y='Price');

### Model Building

In [None]:
# One-hot encode the categorical columns of the frame, then preview the widened result.
# NOTE(review): every non-numeric column is expanded — confirm no high-cardinality
# text column is being encoded unintentionally.
df = pd.get_dummies(data=df)
df.head()

In [None]:
# Re-inspect columns and dtypes now that the frame has been one-hot encoded.
df.info()

In [None]:
# Everything except the price column is a predictor; price itself is the target.
target = df['Price']
features = df.drop(columns=['Price'])

# Plain NumPy arrays for scikit-learn.
X = features.to_numpy()
y = target.to_numpy()

##### Train-Test Split

![](asset/tnn1.png)


![](asset/tnn2.png)


In [None]:
# Hold out 20 % of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=50,
)

In [None]:
# Show the documentation for train_test_split. The built-in help() is used
# instead of the IPython-only `name?` syntax so the cell is also valid plain Python.
help(train_test_split)

In [None]:
# Shapes of the four split arrays, in the order: X_train, X_test, y_train, y_test.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

In [None]:
# Fit a linear regression on the training split (fit returns the estimator itself).
lr = LinearRegression().fit(X_train, y_train)

# Predictions on both splits, used below to compare training and test fit quality.
y_pred_train, y_pred_test = lr.predict(X_train), lr.predict(X_test)

In [None]:
# R² computed from the stored predictions, for the training and the held-out split.
r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)
print("R2-Score-Training Data:", r2_train)
print("R2-Score-Test Data:", r2_test)


In [None]:
# Same metric obtained through the estimator's own score() method, which
# predicts internally instead of using the stored predictions.
for label, (X_part, y_part) in (
    ("R2-Score-Training Data:", (X_train, y_train)),
    ("R2-Score-Test Data:", (X_test, y_test)),
):
    print(label, lr.score(X_part, y_part))