In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pandas.plotting import scatter_matrix
from scipy import stats
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedKFold

In [None]:
# Load the housing dataset (comma-separated, UTF-8) into the working frame `df`.
df = pd.read_csv(
    'data/housing.csv',
    encoding='utf8',
    sep=',',
)
# Quick-look helpers, intentionally left disabled:
# df.head()
# df.info()

In [None]:
# One 30-bin histogram per column that DataFrame.hist can plot, on a 20x15 figure.
df.hist(bins=30, figsize=(20, 15))
plt.show()

In [None]:
# Finer-grained (100-bin) histogram of the single column housing_median_age.
df['housing_median_age'].hist(bins=100, figsize=(10, 8))
plt.show()

In [None]:
# ocean_proximity is a string (categorical) column, so a histogram does not apply:
# count the rows per category and show the counts as a bar chart.
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
df['ocean_proximity'].value_counts().plot.bar()

### **Data Description**

In [None]:
# Summary statistics table for df, shown as the cell output.
df.describe()

### **Visualizing Geographical Data**

In [None]:
# Longitude vs. latitude; the low alpha makes densely populated areas stand out.
df.plot.scatter(x='longitude', y='latitude', alpha=0.1)

### **Regression to Classification**

In [None]:
def value_to_class(value):
    """Bucket a house value into one of five price classes.

    Classes are numbered from the most to the least expensive bucket:
    1 for values of at least 400000, 2 for at least 300000, 3 for at least
    200000, 4 for at least 100000, and 5 for everything else.
    """
    # Lower bounds are checked from highest to lowest; the first one met wins.
    for class_id, lower_bound in ((1, 400000), (2, 300000), (3, 200000), (4, 100000)):
        if value >= lower_bound:
            return class_id
    # Nothing matched: the value falls in the lowest bucket.
    return 5


In [None]:
# Turn the regression target into a class label by bucketing every house value.
house_values = df['median_house_value']
df['median_house_class'] = house_values.apply(value_to_class)
df['median_house_class']

In [None]:
# Distribution of the derived class labels (the 1-5 values produced by value_to_class).
df['median_house_class'].hist(bins=30, figsize=(10, 5))
plt.show()

### **Missing Values**

In [None]:
# Load the missing-values variant of the dataset into `mod_df` and preview it.
mod_df = pd.read_csv(
    'data/housing_missing_value.csv',
    encoding='utf8',
    sep=',',
)
mod_df.head()


In [None]:
# Number of missing values in each column.
mod_df.isna().sum()

In [None]:
# Option 1: drop every row that has a missing value (preview only; mod_df is not reassigned).
mod_df.dropna(axis=0).head()

In [None]:
# Option 2: drop every column that has at least one missing value (preview only).
mod_df.dropna(axis=1, how='any').head()

In [None]:
# Option 3: replace missing total_rooms values with the constant 1 (preview only).
mod_df['total_rooms'].fillna(1).head()

In [None]:
# Same constant fill applied to every column at once; preview the first rows only,
# like the neighbouring cells, instead of rendering the whole frame.
mod_df.fillna(1).head()

In [None]:
# Option 4 (kept): impute missing total_bedrooms with the column mean.
bedrooms_mean = mod_df['total_bedrooms'].mean()
mod_df['total_bedrooms'] = mod_df['total_bedrooms'].fillna(bedrooms_mean)

### **Outliers**

In [None]:
# Tiny toy dataset: five IDs, one of which has a salary far above the rest.
employee_ids = ['e1', 'e2', 'e3', 'e4', 'e5']
salary_values = [1000, 1800, 900, 15100, 3200]
salaries = {'ID': employee_ids, 'salary': salary_values}
salaries_df = pd.DataFrame(salaries)
salaries_df.head()

In [None]:
# Mean salary of the toy data; the single 15100 entry pulls it well above the other four values.
salaries_df['salary'].mean()

In [None]:
# Box plot of total_bedrooms to look for outliers visually.
sns.boxplot(x = df['total_bedrooms'])

In [None]:
# Compare the number of non-missing total_rooms values with those above the 5000 cut-off.
total_rooms = df['total_rooms']
print("ALL:", total_rooms.count())
print("Outliers:", total_rooms[total_rooms > 5000].count())

In [None]:
# Keep only the rows that are NOT outliers. The previous cell counts total_rooms > 5000
# as outliers, so the cleaned frame is the complement (<= 5000), not the outliers themselves.
clean_df = df[df['total_rooms'] <= 5000]

In [None]:
# Population vs. median house value for the full dataset.
fig, ax = plt.subplots(figsize=(10, 8))
ax.scatter(x=df['population'], y=df['median_house_value'])
ax.set(xlabel='Population', ylabel='Value')
plt.show()

In [None]:
# Same scatter after removing rows with a population of 12000 or more.
t_df = df.loc[df['population'] < 12000]
fig, ax = plt.subplots(figsize=(10, 8))
ax.scatter(x=t_df['population'], y=t_df['median_house_value'])
ax.set(xlabel='Population', ylabel='Value')
plt.show()

### **Clean data using several features** to solve the outlier problem

In [None]:
# Filter on two features at once. Each comparison needs its own parentheses:
# `&` binds tighter than `<`, so without them the expression is evaluated as
# ((total_rooms < 5000) & population) < 20000 instead of combining two conditions.
clean_df = df[(df['total_rooms'] < 5000) & (df['population'] < 20000)]

### **Z-Score**

In [None]:
# Absolute z-score of every numeric cell (ocean_proximity is the text column, so it is dropped).
# The values are handed to SciPy as a plain float ndarray so that `z` is always an ndarray
# (the return type for DataFrame input varies with the SciPy version), and
# nan_policy='omit' keeps a single missing value from turning its whole column into NaN.
numeric_values = df.drop(['ocean_proximity'], axis=1).to_numpy(dtype=float)
z = np.abs(stats.zscore(numeric_values, nan_policy='omit'))
print("Shape of DataFrame after drop:", numeric_values.shape)
# Verify the shape of z
print("Shape of z:", z.shape)

In [None]:
# Find outlier positions where z-scores > 3
_res = np.where(z > 3)
print("Rows with outliers: ", _res[0])
print("Columns with outliers: ", _res[1])


In [None]:
# Access specific element at position (95, 6)
z[95][6]



### **Correlation**

In [None]:
# Two independent samples of 1000 integers in [0, 100): correlation should be close to 0.
# A seeded generator makes the printed numbers reproducible on every run.
rng = np.random.default_rng(42)
x = rng.integers(0, 100, 1000)
y = rng.integers(0, 100, 1000)

np.corrcoef(x, y)

In [None]:
# Scatter plot of the x and y samples created in the previous cell.
plt.scatter(x,y)
plt.show()

In [None]:
# y = x + noise: the two samples are positively correlated.
# A seeded generator makes the printed numbers reproducible on every run.
rng = np.random.default_rng(42)
x = rng.integers(0, 100, 1000)
y = x + rng.integers(0, 100, 1000)
# Print explicitly: a bare expression in the middle of a cell is silently discarded.
print(np.corrcoef(x, y))
plt.scatter(x, y)
plt.show()

In [None]:
# y = 100 - x - noise: the two samples are negatively correlated.
# A seeded generator makes the printed numbers reproducible on every run.
rng = np.random.default_rng(42)
x = rng.integers(0, 100, 1000)
y = 100 - x - rng.integers(0, 100, 1000)
# Print explicitly: a bare expression in the middle of a cell is silently discarded.
print(np.corrcoef(x, y))
plt.scatter(x, y)
plt.show()

In [None]:
# Assuming df is your DataFrame
# Drop non-numeric columns, e.g., 'ocean_proximity'
numeric_df = df.select_dtypes(include=[np.number])

# Calculate the correlation matrix
corr_matrix = numeric_df.corr()

# Display the correlation matrix
print(corr_matrix)


In [None]:
# Pairwise scatter plots (histograms on the diagonal) for four selected columns.
cols = [
    "median_income",
    "median_house_value",
    "total_rooms",
    "housing_median_age",
]
selected = df[cols]
scatter_matrix(selected, figsize=(20, 14))

In [None]:
# Boxen (letter-value) plot of median_income to inspect the upper tail.
sns.boxenplot(x = df['median_income'])

In [None]:
# Total number of rows, then the number of rows with median_income of at least 8.
income = df['median_income']
print(len(income))
print(len(income[income >= 8]))

In [None]:
# Drop the text column, then keep only rows with median_income below 8 in clean_df.
df_new = df.drop(columns=['ocean_proximity'])
income_below_cap = df_new['median_income'] < 8
clean_df = df_new.loc[income_below_cap]
# Correlations BEFORE removing the high-income rows (compare with clean_df.corr()).
df_new.corr()

In [None]:
# Correlation matrix of clean_df, for comparison with df_new.corr().
clean_df.corr()

In [None]:
# Same four-column scatter matrix, this time on the filtered frame clean_df.
cols = [
    "median_income",
    "median_house_value",
    "total_rooms",
    "housing_median_age",
]
selected_clean = clean_df[cols]
scatter_matrix(selected_clean, figsize=(20, 14))

In [None]:
# Correlation matrix of df_new drawn as a colour-coded grid, one tick per column.
corr_values = df_new.corr()
column_names = df_new.columns
tick_positions = range(len(column_names))

plt.figure(figsize=(20, 20))
# fignum=1 draws into the figure created above instead of opening a new one.
plt.matshow(corr_values, fignum=1)
plt.xticks(tick_positions, column_names)
plt.yticks(tick_positions, column_names)
plt.colorbar()
plt.show()

### **KFold**

In [None]:
# Target = derived class label; features = every column except the label and the
# raw house value it was derived from.
data_y = df['median_house_class']
data_x = df.drop(columns=['median_house_class', 'median_house_value'])
len(data_x)

In [None]:
# Stratified 5-fold split without shuffling. StratifiedKFold is already imported in the
# notebook's first cell, so it is not imported again here.
skf = StratifiedKFold(n_splits=5, shuffle=False)

# Each iteration yields positional indices for the train and test parts of one fold.
for (train_index, test_index) in skf.split(data_x, data_y):

    # Training part of the current fold (not used further in this cell)
    data_train_x = data_x.iloc[train_index]
    data_train_y = data_y.iloc[train_index]

    # Test part of the current fold
    data_test_x = data_x.iloc[test_index]
    data_test_y = data_y.iloc[test_index]

    # Number of samples in this fold's test set
    print(data_test_x.shape[0])

    # Class distribution in this fold's test set
    print(data_test_y.value_counts())

    # Separator between folds
    print("\n===========\n")

### **Confusion Matrix**

In [None]:
# Confusion matrix for integer labels: rows are true classes, columns are predicted classes.
y_true = [1, 2, 2, 0, 0, 1]
y_pred = [1, 0, 2, 1, 2, 1]
print(confusion_matrix(y_true=y_true, y_pred=y_pred))

In [None]:
y_true = ['degree-1', 'degree-1', 'degree-2', 'degree-3', 'degree-3', 'degree-2']
y_pred = ['degree-2', 'degree-1', 'degree-3', 'degree-1', 'degree-3', 'degree-2']

# `labels` fixes the order of the rows and columns of the matrix.
lbls = ['degree-1', 'degree-2', 'degree-3']
print(confusion_matrix(y_true=y_true, y_pred=y_pred, labels=lbls))

In [None]:
y_true = ['degree-1', 'degree-1', 'degree-2', 'degree-3', 'degree-3', 'degree-2']
y_pred = ['degree-2', 'degree-1', 'degree-3', 'degree-1', 'degree-3', 'degree-2']

lbls = ['degree-1', 'degree-2', 'degree-3']
cm = confusion_matrix( y_true, y_pred, labels=lbls )

fig = plt.figure()
ax = fig.add_subplot(1,1,1)
cax = ax.matshow(cm)
fig.colorbar(cax)

# Pin exactly one tick per class before naming the ticks. Labelling the automatic ticks
# with a padded list ([''] + lbls) depends on where Matplotlib happens to place them and
# can mislabel the cells or warn about a label/tick-count mismatch.
tick_positions = list(range(len(lbls)))
ax.set_xticks(tick_positions)
ax.set_xticklabels(lbls)
ax.set_yticks(tick_positions)
ax.set_yticklabels(lbls)

# Columns are predictions, rows are true classes.
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
plt.show()

### **Accuracy**

In [None]:
# Accuracy is the fraction of predictions that match the true label exactly (4 of 6 here).
y_true = ['degree-1', 'degree-1', 'degree-1', 'degree-3', 'degree-3', 'degree-2']
y_pred = ['degree-2', 'degree-1', 'degree-3', 'degree-3', 'degree-3', 'degree-2']

print(accuracy_score(y_true, y_pred))

In [None]:
# Always predicting 'degree-1' still matches 3 of the 6 true labels, which shows that
# accuracy alone can flatter a model that ignores the other classes.
y_true = ['degree-1', 'degree-1', 'degree-1', 'degree-3', 'degree-3', 'degree-2']
y_pred = ['degree-1', 'degree-1', 'degree-1', 'degree-1', 'degree-1', 'degree-1']

print(accuracy_score(y_true, y_pred))

### **Precision/Recall/F1-Score**

#### **Classification Report**

In [None]:
# Per-class precision, recall, F1 and support for a small hand-made example.
y_true = ['degree-1', 'degree-1', 'degree-1', 'degree-3', 'degree-3', 'degree-2', 'degree-3', 'degree-1']
y_pred = ['degree-2', 'degree-2', 'degree-1', 'degree-3', 'degree-2', 'degree-1', 'degree-3', 'degree-1']

print(classification_report(y_true, y_pred))

In [None]:
# Same constant-prediction case as in the accuracy section. 'degree-2' and 'degree-3' are
# never predicted, so their precision is undefined; zero_division=0 reports those entries
# as 0 explicitly instead of emitting an UndefinedMetricWarning for each of them.
y_true = ['degree-1', 'degree-1', 'degree-1', 'degree-3', 'degree-3', 'degree-2']
y_pred = ['degree-1', 'degree-1', 'degree-1', 'degree-1', 'degree-1', 'degree-1']

print(classification_report(y_true, y_pred, zero_division=0))

In [13]:
# Example 1: integer labels shown under readable names via target_names.
# zero_division=0 states explicitly that undefined metrics are reported as 0, which
# avoids the UndefinedMetricWarning messages without changing the numbers in the table.
y_true = [0, 1, 2, 2, 2]
y_pred = [0, 0, 2, 2, 1]
target_names = ['class 0', 'class 1', 'class 2']
print(classification_report(y_true, y_pred, target_names=target_names, zero_division=0))

# Example 2: `labels` requests rows for classes 2 and 3 even though they never occur
# in y_true or y_pred, so their metrics are undefined and reported as 0.
y_pred = [1, 1, 0]
y_true = [1, 1, 1]
print(classification_report(y_true, y_pred, labels=[1, 2, 3], zero_division=0))



              precision    recall  f1-score   support

     class 0       0.50      1.00      0.67         1
     class 1       0.00      0.00      0.00         1
     class 2       1.00      0.67      0.80         3

    accuracy                           0.60         5
   macro avg       0.50      0.56      0.49         5
weighted avg       0.70      0.60      0.61         5

              precision    recall  f1-score   support

           1       1.00      0.67      0.80         3
           2       0.00      0.00      0.00         0
           3       0.00      0.00      0.00         0

   micro avg       1.00      0.67      0.80         3
   macro avg       0.33      0.22      0.27         3
weighted avg       1.00      0.67      0.80         3



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
