In [None]:
import seaborn as sns
import pandas as pd
# Fetch seaborn's "diamonds" sample dataset and preview its first five rows.
data = sns.load_dataset(name='diamonds')
data.head(5)


2. Display the correlation data for the price column.

In [None]:
# Correlation of every numeric column with price, strongest first.
# The frame still contains the categorical cut/color/clarity columns here;
# calling corr() on them raises on recent pandas versions, so restrict the
# computation to numeric columns explicitly.
data.select_dtypes(include='number').corr()[['price']].sort_values(by='price', ascending=False)

3. Create test and training datasets using the carat, table, and depth columns as the independent 
variables and the price as the dependent variable. (The x, y, and z columns contain information 
that’s related to the table and depth columns, so it’s not necessary to use those columns.) The test 
dataset should consist of 30% of the total dataset, and you should specify a value for the 
random_state parameter.

In [None]:
from sklearn.model_selection import train_test_split
# Independent variables: carat, table and depth. Dependent variable: price.
X = data.loc[:, ['carat', 'table', 'depth']]
y = data.loc[:, 'price']
# Hold out 30% of the rows for testing; the fixed random_state makes the
# split reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)


4. Create and fit the model.


In [None]:
from sklearn.linear_model import LinearRegression
# Linear regression fitted on the training split
# (features: carat, table, depth; target: price).
model = LinearRegression()
model.fit(X=X_train, y=y_train)

5. Score the model with the test dataset.


In [None]:
# Score (R^2 for scikit-learn regressors) on the held-out test split.
model.score(X=X_test, y=y_test)

6. Score the model with the training dataset.

In [None]:
# Score on the data the model was fitted on, for comparison with the test score.
model.score(X=X_train, y=y_train)

7. Use the model to make predictions about the test data, and store the results in a DataFrame.

In [None]:
# Predicted prices for the test rows, in the same row order as X_test.
y_predicted = model.predict(X=X_test)

8. Create a DataFrame that contains the columns used to make predictions, along with the actual price 
and the predicted price. Then, display the first five rows of data to see how close the predicted 
prices are.

In [None]:
# Put the predictions next to the inputs and the actual price.
# X_test / y_test keep their original row labels after the split, so their
# indexes are reset to 0..n-1 to line up with the freshly built prediction frame.
predicted = pd.DataFrame({'predictedPrice': y_predicted})
final = pd.concat(
    [
        predicted,
        X_test.reset_index(drop=True),
        y_test.reset_index(drop=True),
    ],
    axis=1,
)
final.head()

9. Calculate the residuals for the regression, and store the results in a new column in the DataFrame 
you created in step 8. Then, display the first five rows of data to see the residual values.

In [None]:
# Residual = actual price minus predicted price.
final['residual'] = final['price'].sub(final['predictedPrice'])
final.head()

10.Plot the residuals using a Seaborn KDE plot.

In [None]:
# Kernel density estimate of the residual column.
sns.kdeplot(x='residual', data=final)

PART 2

1. Drop the x, y, and z columns, since they won’t be used to make predictions.


In [None]:
# Remove the x, y and z columns, which are not used for prediction.
# errors='ignore' keeps this cell safe to re-run: `data` is rebound to the
# result, so on a second run the columns are already gone and a plain drop
# would raise KeyError.
data = data.drop(columns=['x', 'y', 'z'], errors='ignore')

2. Use the info() method to display the data types for the columns

In [None]:
# Print a summary of the remaining columns and their dtypes.
data.info()

3. Convert the three columns with categorical data into dummy variables and store the results in a 
new DataFrame.

In [None]:
# One-hot encode the three categorical columns into their own frame.
catCols = ['cut', 'color', 'clarity']
dummies = pd.get_dummies(data=data.loc[:, catCols])
dummies.info()

4. Drop the categorical columns from the original DataFrame and join the DataFrame with the dummy 
variables to it. Store the result in a DataFrame named dataDummies, and then use the info() method 
to display the resulting columns.

In [None]:
# Swap the categorical columns for their dummy-variable equivalents.
dataDummies = data.drop(columns=catCols).join(dummies)
dataDummies.info()

5. Rescale the data in the numeric columns, and then display the rescaled data

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardize the numeric columns; the dummy columns are left untouched.
numCol = ['carat', 'depth', 'table', 'price']
scaler = StandardScaler()
scaler.fit(dataDummies[numCol])
dataDummies[numCol] = scaler.transform(dataDummies[numCol])
dataDummies.head()


6. Display the correlation data for the price column.

In [None]:
# Correlation of every column (numeric and dummy) with price, strongest first.
dataDummies.corr().loc[:, ['price']].sort_values('price', ascending=False)


7. Create test and training datasets using the five columns with the highest correlation. The test dataset 
should consist of 30% of the total dataset, and you should specify a value for the random_state 
parameter.

In [None]:




# Split whole rows 70/30 so that features and price stay together in each part;
# the fixed random_state makes the split reproducible.
dataTrain, dataTest = train_test_split(
    dataDummies,
    test_size=0.3,
    random_state=42,
)


8. Create and fit the model.

In [None]:
# Feature columns used for this model.
xCols = ['carat', 'clarity_SI2', 'table', 'color_I', 'cut_Premium']
# Fit a fresh linear regression on those columns of the training split.
model = LinearRegression()
model.fit(X=dataTrain[xCols], y=dataTrain['price'])



9. Score the model with the test dataset.

In [None]:
# Score (R^2 for scikit-learn regressors) on the held-out test split.
model.score(X=dataTest[xCols], y=dataTest['price'])

10. Score the model with the training dataset.

In [None]:
# Score on the training split, for comparison with the test score.
model.score(X=dataTrain[xCols], y=dataTrain['price'])

11.Use the model to make predictions about the test data, and store the results in a DataFrame.

In [None]:
# Predicted (rescaled) prices for the test rows, in the same row order as dataTest.
y_predicted = model.predict(X=dataTest[xCols])


12.Create a DataFrame that contains the columns used to make predictions, along with the actual price 
and the predicted price. Then, display the first five rows of data to see how close the predicted 
prices are.

In [None]:
# Put the predictions next to the inputs and the actual price.
# dataTest keeps its original row labels after the split, so the pieces taken
# from it are re-indexed to 0..n-1 to line up with the prediction frame.
predicted = pd.DataFrame({'predictedPrice': y_predicted})
final = pd.concat(
    [
        predicted,
        dataTest[xCols].reset_index(drop=True),
        dataTest['price'].reset_index(drop=True),
    ],
    axis=1,
)
# Show the feature columns first, then actual and predicted price.
final[xCols + ['price', 'predictedPrice']].head()

13. Calculate the residuals for the regression, and store the results in a new column in the DataFrame 
you created in step 12. Then, display the first five rows of data to see the residual values.

In [None]:
# Residual = actual price minus predicted price.
final['residual'] = final['price'].sub(final['predictedPrice'])
final.head()

14.Plot the residuals using a Seaborn KDE plot.

In [None]:
# Kernel density estimate of the residual column.
sns.kdeplot(x='residual', data=final)

15.Use a for loop with the feature selection model to test different numbers of features. Be sure not to 
include the price column in the list of independent variables. Note that it may take a few minutes 
for this code to run.

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_regression

from functools import partial

# For every feature count k = 1..n_features: keep the k features that
# SelectKBest ranks highest by mutual information with price, fit a linear
# regression on them, and record its score on the test and training splits.
# After the loop, testScores[j] / trainScores[j] hold the scores for k = j + 1.
model = LinearRegression()
testScores = []
trainScores = []

# price is the target, so it must never be offered as a feature. These frames
# are the same on every iteration, so build them once up front.
xTrainAll = dataTrain.drop(columns=['price'])
xTestAll = dataTest.drop(columns=['price'])

# mutual_info_regression uses randomness internally; pin its seed so the
# selected features (and therefore the scores) are reproducible.
scoreFunc = partial(mutual_info_regression, random_state=42)

# len(dataTrain.columns) counts price too, so i runs from 1 to the number of features.
for i in range(1, len(dataTrain.columns)):
    fs = SelectKBest(score_func=scoreFunc, k=i)
    fs.fit(xTrainAll, dataTrain['price'])

    # Everything below must run once per k. If it sits outside the loop, only
    # the selector for the final k is ever scored and each list ends up with
    # a single entry.
    x_train_fs = fs.transform(xTrainAll)
    x_test_fs = fs.transform(xTestAll)

    model.fit(x_train_fs, dataTrain['price'])

    testScore = model.score(x_test_fs, dataTest['price'])
    trainScore = model.score(x_train_fs, dataTrain['price'])
    testScores.append(testScore)
    trainScores.append(trainScore)













In [None]:
# Collect the recorded scores into one frame and plot them against the
# feature count. numFeatures is 1-based: row j holds the scores appended on
# the (j + 1)-th pass.
# NOTE: this rebinds `data`, which earlier in the notebook held the diamonds dataset.
data = pd.DataFrame({
    'numFeatures': range(1, len(testScores) + 1),
    'testScores': testScores,
    'trainScores': trainScores,
})
data.plot(x='numFeatures', y=['testScores', 'trainScores'])
