<a href="https://colab.research.google.com/github/nissi00/machine-learning/blob/main/classificateurs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install ucimlrepo



In [None]:
import pandas as pd
from ucimlrepo import fetch_ucirepo

# Fetch the "adult" dataset (UCI repository id 2).
adult = fetch_ucirepo(id=2)

# Feature frame and target frame as returned by ucimlrepo.
X = adult.data.features
y = adult.data.targets

# Rename the single target column to "income" so it has a stable, readable name.
y = y.rename(columns={y.columns[0]: "income"})

# Join features and target column-wise, then collapse the label spellings that
# carry a trailing period (">50K." / "<=50K.") onto the period-free ones so the
# target has exactly two classes. Note that `y` itself keeps the raw labels.
data = pd.concat([X, y], axis=1)
data["income"] = data["income"].replace({">50K.": ">50K", "<=50K.": "<=50K"})

# Persist the combined frame so later runs can start from the CSV.
data.to_csv("adult_data.csv", index=False)

# Reload from disk. "?" is parsed as missing in addition to the pandas defaults,
# so placeholder entries are counted by isnull() and handled by the imputation
# step instead of surviving as a category of their own.
# NOTE(review): assumes the fetched data marks some unknown values with "?" —
# confirm with (X == "?").sum(); if none are present this option is a no-op.
data = pd.read_csv("adult_data.csv", na_values="?")
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [None]:
# Preview the first five rows of the target frame.
y.head(5)

Unnamed: 0,income
0,<=50K
1,<=50K
2,<=50K
3,<=50K
4,<=50K


In [None]:
# Number of missing entries in each column.
data.isna().sum()

Unnamed: 0,0
age,0
workclass,963
fnlwgt,0
education,0
education-num,0
marital-status,0
occupation,966
relationship,0
race,0
sex,0


In [None]:
# Dimensions of the combined frame as (rows, columns).
data.shape

(48842, 15)

In [None]:
# Fill missing values column by column: the median for float64/int64 columns,
# the most frequent value (mode) for every other column.
for column in data.columns:
    if data[column].isnull().sum() == 0:
        continue  # nothing to impute in this column

    if data[column].dtype in ['float64', 'int64']:
        fill_value = data[column].median()
    else:
        fill_value = data[column].mode()[0]

    # Assign the filled column back to the frame. Calling
    # data[column].fillna(..., inplace=True) operates on an intermediate object:
    # it emits a FutureWarning on recent pandas and leaves `data` unchanged once
    # Copy-on-Write is enabled.
    data[column] = data[column].fillna(fill_value)

In [None]:
# Collect the names of all columns whose dtype is 'object'; these are the
# categorical variables that need encoding.
object_cols = [name for name, dtype in data.dtypes.items() if dtype == 'object']
s = data.dtypes == 'object'  # boolean mask kept under its original name

print("Categorical variables in the dataset:", object_cols)

Categorical variables in the dataset: ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country', 'income']


In [None]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

In [None]:
# One LabelEncoder per categorical column, stored by column name so each
# fitted encoder remains available after the columns are replaced.
encoders = {col: LabelEncoder() for col in object_cols}

# Replace every categorical column with its integer codes.
for col, encoder in encoders.items():
    data[col] = encoder.fit_transform(data[col])

print("All features are now numerical")

All features are now numerical


In [None]:
# Independent (deep) copy of the encoded frame.
ds = data.copy(deep=True)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt

In [None]:
# Target is the encoded income column; the features are every other column.
target = data['income']
features = data.drop(columns=['income'])

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn import metrics

In [None]:
# Build one pipeline per regressor: a StandardScaler followed by the model.
# The tree-based models are seeded with the same value as the train/test split
# so that repeated runs of the notebook produce the same scores.

pipeline_lr = Pipeline([("scalar1", StandardScaler()),
                        ("lr_classifier", LinearRegression())])

pipeline_dt = Pipeline([("scalar2", StandardScaler()),
                        ("dt_classifier", DecisionTreeRegressor(random_state=42))])

pipeline_rf = Pipeline([("scalar3", StandardScaler()),
                        ("rf_classifier", RandomForestRegressor(random_state=42))])

pipeline_kn = Pipeline([("scalar4", StandardScaler()),
                        ("kn_classifier", KNeighborsRegressor())])

pipeline_xgb = Pipeline([("scalar5", StandardScaler()),
                         ("xgb_classifier", XGBRegressor())])

# List of all the pipelines; the position in this list is the key into pipe_dict.
pipelines = [pipeline_lr, pipeline_dt, pipeline_rf, pipeline_kn, pipeline_xgb]

# Display name of each pipeline, keyed by its index in `pipelines`.
pipe_dict = {0: "LinearRegression", 1: "DecisionTree", 2: "RandomForest", 3: "KNeighbors", 4: "XGBRegressor"}

# Fit every pipeline on the training split.
for pipe in pipelines:
    pipe.fit(X_train, y_train)

In [None]:
# 10-fold cross-validation of each pipeline on the training split. The scorer
# is the negated RMSE, so the printed means are negative and values closer to
# zero are better.
cv_results_rms = []
for idx, candidate in enumerate(pipelines):
    fold_scores = cross_val_score(
        candidate,
        X_train,
        y_train,
        scoring="neg_root_mean_squared_error",
        cv=10,
    )
    cv_results_rms.append(fold_scores)
    model_name = pipe_dict[idx]
    print("%s: %f " % (model_name, fold_scores.mean()))

LinearRegression: -0.367054 
DecisionTree: -0.430317 
RandomForest: -0.316717 
KNeighbors: -0.349980 
XGBRegressor: -0.302117 


In [None]:
# Predict on the held-out test split with the fitted XGBoost pipeline.
pred = pipeline_xgb.predict(X_test)

# Compute the quantities that are reported more than once a single time.
n_obs = len(y_test)
n_features = X_test.shape[1]
r2_xgb = metrics.r2_score(y_test, pred)
mse_xgb = metrics.mean_squared_error(y_test, pred)

# Regression metrics; adjusted R^2 penalises R^2 by the number of features.
print("R^2:", r2_xgb)
print("Adjusted R^2:", 1 - (1 - r2_xgb) * (n_obs - 1) / (n_obs - n_features - 1))
print("MAE:", metrics.mean_absolute_error(y_test, pred))
print("MSE:", mse_xgb)
print("RMSE:", np.sqrt(mse_xgb))

R^2: 0.49625234012932484
Adjusted R^2: 0.49552930678524143
MAE: 0.19191321817746812
MSE: 0.0921629390052834
RMSE: 0.3035834959369224


In [None]:
# Import DecisionTreeClassifier from sklearn.tree
from sklearn.tree import DecisionTreeClassifier

# Build and fit a decision tree classifier on the training split. The seed
# matches the one used for the train/test split so the reported metrics are
# reproducible across runs.
decision_tree = DecisionTreeClassifier(random_state=42)
decision_tree = decision_tree.fit(X_train, y_train)

In [None]:
# Fit a random forest classifier on the training split. The seed matches the
# one used for the train/test split so the reported metrics are reproducible
# across runs.
rfc = RandomForestClassifier(random_state=42)
rfc = rfc.fit(X_train, y_train)

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Make predictions on the test set using the random forest classifier
y_pred_rfc = rfc.predict(X_test)

# Calculate performance metrics for the random forest classifier.
# average='weighted' averages the per-class scores weighted by class support.
accuracy_rfc = accuracy_score(y_test, y_pred_rfc)
precision_rfc = precision_score(y_test, y_pred_rfc, average='weighted')
recall_rfc = recall_score(y_test, y_pred_rfc, average='weighted')
f1_rfc = f1_score(y_test, y_pred_rfc, average='weighted')

# Keep the matrix under a model-specific name: the generic `conf_matrix` is
# reassigned by the decision tree evaluation and would otherwise be lost.
conf_matrix_rfc = confusion_matrix(y_test, y_pred_rfc)
conf_matrix = conf_matrix_rfc

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Make predictions on the test set using the decision tree classifier
y_pred_dt = decision_tree.predict(X_test)

# Calculate performance metrics for the decision tree classifier.
# average='weighted' averages the per-class scores weighted by class support.
accuracy_dt = accuracy_score(y_test, y_pred_dt)
precision_dt = precision_score(y_test, y_pred_dt, average='weighted')
recall_dt = recall_score(y_test, y_pred_dt, average='weighted')
f1_dt = f1_score(y_test, y_pred_dt, average='weighted')

# Store the matrix under a model-specific name so it does not share a variable
# with the random forest evaluation; `conf_matrix` is still assigned as before.
conf_matrix_dt = confusion_matrix(y_test, y_pred_dt)
conf_matrix = conf_matrix_dt


In [None]:
# Report the decision tree's test-set scores, one metric per line.
print('Decision Tree Classifier:')
for label, score in (
    ('Accuracy:', accuracy_dt),
    ('Precision:', precision_dt),
    ('Recall:', recall_dt),
    ('F1 Score:', f1_dt),
):
    print(label, score)
print()

Decision Tree Classifier:
Accuracy: 0.808475790766711
Precision: 0.8091190488936076
Recall: 0.808475790766711
F1 Score: 0.8087923772139216



In [None]:
# Report the random forest's test-set scores, one metric per line.
print('random forest Classifier:')
for label, score in (
    ('Accuracy:', accuracy_rfc),
    ('Precision:', precision_rfc),
    ('Recall:', recall_rfc),
    ('F1 Score:', f1_rfc),
):
    print(label, score)
print()

random forest Classifier:
Accuracy: 0.8530044016787798
Precision: 0.8467955191891506
Recall: 0.8530044016787798
F1 Score: 0.8478050168458307



In [None]:
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor

# Standardize the features: the scaler is fitted on the training split only
# and then reused unchanged on the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train an XGBoost regressor with default settings on the scaled training data.
model = XGBRegressor().fit(X_train_scaled, y_train)

# Continuous predictions for the scaled test split.
pred = model.predict(X_test_scaled)


In [None]:
from sklearn import metrics
import numpy as np

# Cut-off used to turn continuous values into binary classes.
threshold = 0.5

# Values strictly above the threshold become class 1, everything else class 0.
# The same rule is applied to the predictions and to the test targets.
pred_class = (pred > threshold).astype(int)
y_test_class = (y_test > threshold).astype(int)

# Classification metrics, computed in the order they are reported below.
accuracy, precision, recall, f1 = (
    scorer(y_test_class, pred_class)
    for scorer in (
        metrics.accuracy_score,
        metrics.precision_score,
        metrics.recall_score,
        metrics.f1_score,
    )
)

# Show the results rounded to four decimals.
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")


Accuracy: 0.8708
Precision: 0.7887
Recall: 0.6340
F1-Score: 0.7029
