#  regression evaluating metrics

In [47]:
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_california_housing

In [10]:
housing_df=fetch_california_housing()

In [11]:
housing_df

{'data': array([[   8.3252    ,   41.        ,    6.98412698, ...,    2.55555556,
           37.88      , -122.23      ],
        [   8.3014    ,   21.        ,    6.23813708, ...,    2.10984183,
           37.86      , -122.22      ],
        [   7.2574    ,   52.        ,    8.28813559, ...,    2.80225989,
           37.85      , -122.24      ],
        ...,
        [   1.7       ,   17.        ,    5.20554273, ...,    2.3256351 ,
           39.43      , -121.22      ],
        [   1.8672    ,   18.        ,    5.32951289, ...,    2.12320917,
           39.43      , -121.32      ],
        [   2.3886    ,   16.        ,    5.25471698, ...,    2.61698113,
           39.37      , -121.24      ]]),
 'target': array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894]),
 'frame': None,
 'target_names': ['MedHouseVal'],
 'feature_names': ['MedInc',
  'HouseAge',
  'AveRooms',
  'AveBedrms',
  'Population',
  'AveOccup',
  'Latitude',
  'Longitude'],
 'DESCR': '.. _california_housing_dataset:\n

In [12]:
house_df=pd.DataFrame(columns=housing_df['feature_names'],data=housing_df['data'])

In [16]:
house_df['target']=housing_df['target']

In [17]:
house_df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,target
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [21]:
from sklearn.ensemble import RandomForestRegressor 
from sklearn.model_selection import train_test_split

In [18]:
x=house_df.drop(["target"],axis=1)
y=house_df['target']

In [22]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2)

In [23]:
x_train.shape,x_test.shape

((16512, 8), (4128, 8))

In [26]:
model=RandomForestRegressor()

In [27]:
model.fit(x_train,y_train)

In [28]:
model.score(x_test,y_test)

0.8129775380790634

In [30]:
from sklearn.metrics import r2_score

In [31]:
y_pred=model.predict(x_test)

In [32]:
score=r2_score(y_test,y_pred)

In [33]:
score

0.8129775380790634

# using MAE or mean absolute error

In [36]:
from sklearn.metrics import mean_absolute_error

In [37]:
mae=mean_absolute_error(y_test,y_pred)

In [38]:
mae

0.33143097252906994

In [44]:
df=pd.DataFrame(data={"actual_values":y_test,
                     "predicted_values":y_pred})
df["differences"]=df["actual_values"]-df["predicted_values"]

df.head()    #the results are shuffled because we shuffled our data during the spliting of data

Unnamed: 0,actual_values,predicted_values,differences
13570,0.973,2.48351,-1.51051
12935,2.161,1.65459,0.50641
13978,1.286,2.160791,-0.874791
8188,5.00001,3.42173,1.57828
5150,0.909,0.99756,-0.08856


In [45]:
df["differences"].mean()  #its different because the MEA includes the absolute while here we only checked for mean

0.0013650943798454327

In [49]:
np.abs(df["differences"]).mean()   #and vola its the same as MAE

0.33143097252906994

# lets check out the mean squared error

In [50]:
from sklearn.metrics import mean_squared_error

In [53]:
mse=mean_squared_error(y_test,y_pred)
mse

0.2615678199801818

In [56]:
df["squared_differences"]=np.square(df["differences"])
df.head()

Unnamed: 0,actual_values,predicted_values,differences,squared_differences
13570,0.973,2.48351,-1.51051,2.281642
12935,2.161,1.65459,0.50641,0.256451
13978,1.286,2.160791,-0.874791,0.765259
8188,5.00001,3.42173,1.57828,2.490966
5150,0.909,0.99756,-0.08856,0.007843


In [62]:
df_error=df.copy()
df_error.iloc[1:100]["differences"]=12

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_error.iloc[1:100]["differences"]=12


In [63]:
df_error.head()

Unnamed: 0,actual_values,predicted_values,differences,squared_differences
13570,0.973,2.48351,-1.51051,2.281642
12935,2.161,1.65459,0.50641,0.256451
13978,1.286,2.160791,-0.874791,0.765259
8188,5.00001,3.42173,1.57828,2.490966
5150,0.909,0.99756,-0.08856,0.007843


In [65]:
heart_ds=pd.read_csv("C:/Users/dread-miles/Documents/Data Sets/BootCamp/heart-disease.csv")

# using scoring method

In [66]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
np.random.seed(42)

In [67]:
heart_ds.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [68]:
x=heart_ds.drop(["target"],axis=1)
y=heart_ds["target"]

In [69]:
clf=RandomForestClassifier()

In [None]:
clf.fit()

In [72]:
np.random.seed(42)

cv_cros=cross_val_score(clf,x,y,scoring=None) #scoring none means that the evaluation is accuracy

In [73]:
cv_cros

array([0.81967213, 0.90163934, 0.83606557, 0.78333333, 0.78333333])

In [76]:
# cross-validation accuracy
print(f"the cross validation accuracy is {np.mean(cv_cros)*100:.2f}%")

the cross validation accuracy is 82.48%


In [87]:
np.random.seed(42)
# checking for precession
cv_precession=cross_val_score(clf,x,y,cv=5,scoring="precision")

In [88]:
cv_precession

array([0.82352941, 0.93548387, 0.84848485, 0.79411765, 0.76315789])

In [89]:
cv_precession.mean()

0.8329547346025924

In [90]:
#this is cross validated precision
print(f"the precision is:{np.mean(cv_precession)*100:.2f}%")

the precision is:83.30%


In [85]:
np.random.seed(42)
# recall
cv_recall=cross_val_score(clf,x,y,scoring="recall")
cv_recall

array([0.84848485, 0.87878788, 0.84848485, 0.81818182, 0.87878788])

In [86]:
#this is cross validated recall
print(f"the precision is:{np.mean(cv_recall)*100:.2f}%")

the precision is:85.45%


# lets check out cross validation score for regression purposes

In [98]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

In [96]:
rgr=RandomForestRegressor()

In [101]:
rgr

In [99]:
x=house_df.drop(["target"],axis=1)
y=house_df['target']

In [102]:
cross_val=cross_val_score(rgr,x,y,cv=3,scoring=None)

In [103]:
cross_val

array([0.6187768 , 0.72471525, 0.62951553])

In [104]:
np.mean(cross_val)

0.657669193849984

In [114]:
#lets check out the mae or the neg mean squared error
cross_mse=cross_val_score(rgr,x,y,cv=5,scoring="neg_mean_squared_error")

In [115]:
np.mean(cross_mse)

-0.4313044696219627

In [116]:
# cross validation using means squared error
mae_cv=cross_val_score(rgr,x,y,cv=5,scoring="neg_mean_absolute_error")

In [117]:
np.mean(mae_cv)

-0.4661166586191861

# using different evaluation metrics as scikit learn functions

In [119]:
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [120]:
clf=RandomForestClassifier()

In [121]:
x=heart_ds.drop(["target"],axis=1)
y=heart_ds["target"]

In [122]:
x_train,x_test,y_train,y_test=train_test_splitplit(x,y,test_size=0.2)

In [124]:
clf.fit(x_train,y_train);

In [125]:
y_pred=clf.predict(x_test)

In [134]:
print(f"accuray score is {accuracy_score(y_test,y_pred)*100 :.2f}%")
print(f"precission score is {precision_score(y_test,y_pred)*100 :.2f}%")
print(f"recall score is {recall_score(y_test,y_pred)*100 :.2f}%")
print(f"f1 score is {f1_score(y_test,y_pred)*100 :.2f}%")

accuray score is 81.97%
precission score is 78.79%
recall score is 86.67%
f1 score is 82.54%


In [137]:
from sklearn.metrics import mean_absolute_error,r2_score,mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

x=house_df.drop(["target"],axis=1)
y=house_df['target']

x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=42,test_size=0.2)

model=RandomForestRegressor()

model.fit(x_train,y_train)

y_pred=model.predict(x_test)


In [138]:
print(f"the mean absolute error {mean_absolute_error(y_test,y_pred):.2f}")
print(f"the r2 scored error {r2_score(y_test,y_pred):.2f}")
print(f"the mean squared error {mean_squared_error(y_test,y_pred):.2f}")

the mean absolute error 0.33
the r2 scored error 0.81
the mean squared error 0.25


25