In [1]:
import pandas as pd
file = 'diamonds.csv'
df = pd.read_csv(file)
df.head()

Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,4,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [2]:
data = df.drop(['Unnamed: 0', 'x', 'y', 'z', 'table', 'depth'], axis=1)
data['cut'] = data['cut'].map({'Ideal': 5, 'Premium': 4, 'Very Good': 3, 'Good': 2, 'Fair': 1})
data['color'] = data['color'].map({'D': 7, 'E': 6, 'F': 5, 'G': 4, 'H': 3, 'I': 2, 'J': 1})
data['clarity'] = data['clarity'].map({'IF': 8, 'VVS1': 7, 'VVS2': 6, 'VS1': 5, 'VS2': 4, 'SI1': 3, 'SI2': 2, 'I':1})
data.dropna(inplace=True)
target = data['price']
data = data.drop(['price'], axis=1)
feature_names = data.columns
data.head()

Unnamed: 0,carat,cut,color,clarity
0,0.23,5,6,2.0
1,0.21,4,6,3.0
2,0.23,2,6,5.0
3,0.29,4,2,4.0
4,0.31,2,1,2.0


In [3]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=42)

In [4]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [5]:
from sklearn.metrics import mean_squared_error

predictions = model.predict(X_test)
model.fit(X_train, y_train)
MSE = mean_squared_error(y_test, predictions)
r2 = model.score(X_test, y_test)

print(f"MSE: {MSE}, R2: {r2}")

MSE: 1338093.8886915538, R2: 0.9163940495278329


In [6]:
model.coef_

array([8931.57333227,  141.58746381,  318.04464711,  500.83697969])

In [7]:
model.predict(X_test)

array([-1710.74168751,  8227.04961583,  9696.48971724, ...,
        2567.08757884,  7790.96696442,  7132.68599351])

In [8]:
test_df = pd.DataFrame(
    {"carat": ["0.37", "0.47", "0.51", "0.71", "0.9", "1.04"],
     "cut": ["Good", "Good", "Very Good", "Very Good", "Very Good", "Very Good"],
     "color": ["E", "H", "E", "I", "I", "D"],
     "clarity": ["I", "I", "SI2", "SI2", "VVS1", "VS2"],
     "price": ["368", "982", "1715", "2650", "7333", "13039"]
     }
)
test_df.head()

Unnamed: 0,carat,cut,color,clarity,price
0,0.37,Good,E,I,368
1,0.47,Good,H,I,982
2,0.51,Very Good,E,SI2,1715
3,0.71,Very Good,I,SI2,2650
4,0.9,Very Good,I,VVS1,7333


In [9]:
test_df['cut'] = test_df['cut'].map({'Ideal': 5, 'Premium': 4, 'Very Good': 3, 'Good': 2, 'Fair': 1})
test_df['color'] = test_df['color'].map({'D': 7, 'E': 6, 'F': 5, 'G': 4, 'H': 3, 'I': 2, 'J': 1})
test_df['clarity'] = test_df['clarity'].map({'IF': 8, 'VVS1': 7, 'VVS2': 6, 'VS1': 5, 'VS2': 4, 'SI1': 3, 'SI2': 2, 'I':1})
actuals = test_df['price']
test_df = test_df.drop(['price'], axis=1)
test_df.head()

Unnamed: 0,carat,cut,color,clarity
0,0.37,2,6,1
1,0.47,2,3,1
2,0.51,3,6,2
3,0.71,3,2,2
4,0.9,3,2,7


In [10]:
predictions = model.predict(test_df)

In [11]:
pd.DataFrame({"Prediction": predictions, "Actual": actuals}).reset_index(drop=True)

Unnamed: 0,Prediction,Actual
0,-1146.312717,368
1,-1207.289325,982
2,746.531993,1715
3,1260.668071,2650
4,5461.851902,7333
5,6799.984465,13039


In [12]:
MSE = mean_squared_error(actuals, predictions)
r2 = model.score(test_df, actuals)

print(f"MSE: {MSE}, R2: {r2}")

MSE: 8730135.715696106, R2: 0.5686335026141528
