importations

In [19]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline  

def prepare_and_train_model(df):
    """
    Prepare data and train an improved model for development trend score prediction
    """
    X = df.drop('development_trend_score', axis=1)
    y = df['development_trend_score']
    
    X = pd.get_dummies(X, columns=['urban_area_type'])
    
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    
    pipeline = Pipeline([
        ('imputer', KNNImputer(n_neighbors=5)),
        ('scaler', StandardScaler()),
        ('smote', SMOTE(random_state=42)),
        ('classifier', RandomForestClassifier(
            n_estimators=200,
            max_depth=15,
            min_samples_split=5,
            min_samples_leaf=2,
            class_weight='balanced',
            random_state=42
        ))
    ])
    
    # Fit the pipeline
    pipeline.fit(X_train, y_train)
    
    # Make predictions
    y_pred = pipeline.predict(X_test)
    
    # Print classification report
    
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    print(f"Test Accuracy: {accuracy_score(y_test, y_pred)}")

    
    feature_importance = pd.DataFrame({
        'feature': X_train.columns,
        'importance': pipeline.named_steps['classifier'].feature_importances_
    })
    feature_importance = feature_importance.sort_values('importance', ascending=False)
    
    print("\nTop 10 Most Important Features:")
    print(feature_importance.head(10))
    
    return pipeline, feature_importance

def evaluate_with_cross_validation(X, y, pipeline):
    """
    Evaluate the model using cross-validation
    """
    scores = cross_val_score(pipeline, X, y, cv=5, scoring='accuracy')
    print(f"\nCross-validation scores: {scores}")
    print(f"Average CV score: {scores.mean():.3f} (+/- {scores.std() * 2:.3f})")

# Example usage:
# df = pd.read_csv('your_data.csv')
# pipeline, feature_importance = prepare_and_train_model(df)
# X = df.drop('development_trend_score', axis=1)
# y = df['development_trend_score']
# evaluate_with_cross_validation(X, y, pipeline)

In [20]:
df = pd.read_csv('./data/urban_development_dataset.csv')
pipeline, feature_importance = prepare_and_train_model(df)
X = df.drop('development_trend_score', axis=1)
y = df['development_trend_score']
evaluate_with_cross_validation(X, y, pipeline)


Classification Report:
              precision    recall  f1-score   support

           1       0.12      0.06      0.08       204
           2       0.23      0.11      0.15       403
           3       0.40      0.74      0.52       804
           4       0.23      0.09      0.13       390
           5       0.07      0.03      0.04       199

    accuracy                           0.35      2000
   macro avg       0.21      0.21      0.18      2000
weighted avg       0.27      0.35      0.28      2000

Test Accuracy: 0.3455

Top 10 Most Important Features:
                          feature  importance
15          proximity_to_highways    0.053081
8                  retail_density    0.051246
10         green_space_percentage    0.051167
9       office_space_availability    0.050872
17            housing_price_index    0.050817
7               unemployment_rate    0.050535
6   public_transport_availability    0.050358
16                 internet_speed    0.049841
13            crim

ValueError: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "/home/self-ouss/notebooks/.venv/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/self-ouss/notebooks/.venv/lib/python3.12/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/self-ouss/notebooks/.venv/lib/python3.12/site-packages/imblearn/pipeline.py", line 329, in fit
    Xt, yt = self._fit(X, y, routed_params)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/self-ouss/notebooks/.venv/lib/python3.12/site-packages/imblearn/pipeline.py", line 255, in _fit
    X, fitted_transformer = fit_transform_one_cached(
                            ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/self-ouss/notebooks/.venv/lib/python3.12/site-packages/joblib/memory.py", line 312, in __call__
    return self.func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/self-ouss/notebooks/.venv/lib/python3.12/site-packages/imblearn/pipeline.py", line 1104, in _fit_transform_one
    res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/self-ouss/notebooks/.venv/lib/python3.12/site-packages/sklearn/utils/_set_output.py", line 316, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/self-ouss/notebooks/.venv/lib/python3.12/site-packages/sklearn/base.py", line 1101, in fit_transform
    return self.fit(X, y, **fit_params).transform(X)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/self-ouss/notebooks/.venv/lib/python3.12/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/self-ouss/notebooks/.venv/lib/python3.12/site-packages/sklearn/impute/_knn.py", line 230, in fit
    X = self._validate_data(
        ^^^^^^^^^^^^^^^^^^^^
  File "/home/self-ouss/notebooks/.venv/lib/python3.12/site-packages/sklearn/base.py", line 633, in _validate_data
    out = check_array(X, input_name="X", **check_params)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/self-ouss/notebooks/.venv/lib/python3.12/site-packages/sklearn/utils/validation.py", line 1012, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/self-ouss/notebooks/.venv/lib/python3.12/site-packages/sklearn/utils/_array_api.py", line 745, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/self-ouss/notebooks/.venv/lib/python3.12/site-packages/pandas/core/generic.py", line 2153, in __array__
    arr = np.asarray(values, dtype=dtype)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ValueError: could not convert string to float: 'Mixed'

--------------------------------------------------------------------------------
4 fits failed with the following error:
Traceback (most recent call last):
  File "/home/self-ouss/notebooks/.venv/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/self-ouss/notebooks/.venv/lib/python3.12/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/self-ouss/notebooks/.venv/lib/python3.12/site-packages/imblearn/pipeline.py", line 329, in fit
    Xt, yt = self._fit(X, y, routed_params)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/self-ouss/notebooks/.venv/lib/python3.12/site-packages/imblearn/pipeline.py", line 255, in _fit
    X, fitted_transformer = fit_transform_one_cached(
                            ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/self-ouss/notebooks/.venv/lib/python3.12/site-packages/joblib/memory.py", line 312, in __call__
    return self.func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/self-ouss/notebooks/.venv/lib/python3.12/site-packages/imblearn/pipeline.py", line 1104, in _fit_transform_one
    res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/self-ouss/notebooks/.venv/lib/python3.12/site-packages/sklearn/utils/_set_output.py", line 316, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/self-ouss/notebooks/.venv/lib/python3.12/site-packages/sklearn/base.py", line 1101, in fit_transform
    return self.fit(X, y, **fit_params).transform(X)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/self-ouss/notebooks/.venv/lib/python3.12/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/self-ouss/notebooks/.venv/lib/python3.12/site-packages/sklearn/impute/_knn.py", line 230, in fit
    X = self._validate_data(
        ^^^^^^^^^^^^^^^^^^^^
  File "/home/self-ouss/notebooks/.venv/lib/python3.12/site-packages/sklearn/base.py", line 633, in _validate_data
    out = check_array(X, input_name="X", **check_params)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/self-ouss/notebooks/.venv/lib/python3.12/site-packages/sklearn/utils/validation.py", line 1012, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/self-ouss/notebooks/.venv/lib/python3.12/site-packages/sklearn/utils/_array_api.py", line 745, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/self-ouss/notebooks/.venv/lib/python3.12/site-packages/pandas/core/generic.py", line 2153, in __array__
    arr = np.asarray(values, dtype=dtype)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ValueError: could not convert string to float: 'Industrial'


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.multiclass      import OneVsOneClassifier
from sklearn.utils import resample

data preparation

In [7]:
import pandas as pd
from sklearn.utils import resample
from sklearn.preprocessing import LabelEncoder

df = pd.read_csv('./data/urban_development_dataset.csv')
# resampled_data = []
# classes = df['development_trend_score'].unique()
# target_samples = 2000
# for cls in classes:
#     class_data = df[df['development_trend_score'] == cls]
#     if len(class_data) > target_samples:
#         resampled = resample(class_data, replace=False, n_samples=target_samples, random_state=42)
#     else:
#         resampled = resample(class_data, replace=True, n_samples=target_samples, random_state=42)
#     resampled_data.append(resampled)

# balanced_df = pd.concat(resampled_data)
# df = balanced_df
print("Répartition initiale des classes :")
print(df['development_trend_score'].value_counts())

test_data = pd.read_csv('./data/urban_development_test_data.csv')
common_columns = df.columns.intersection(test_data.columns)
#definition of the X and Y
Y = df['development_trend_score']
# Y -= 1
X = df[common_columns]


test_data = test_data[common_columns]
le = LabelEncoder()
df.head()


Répartition initiale des classes :
development_trend_score
3    4019
2    2018
4    1949
1    1018
5     996
Name: count, dtype: int64


Unnamed: 0,population_density,median_age,average_household_income,number_of_schools,number_of_hospitals,number_of_parks,public_transport_availability,unemployment_rate,retail_density,office_space_availability,...,air_quality_index,crime_rate_per_1000,fire_station_proximity,proximity_to_highways,internet_speed,housing_price_index,year,road_density,urban_area_type,development_trend_score
0,5993.428306,28.215053,61965.724953,5.0,2.0,3.0,,7.861705,37.189497,88712.383725,...,62.395034,41.80296,6.402334,17.249894,77.988714,90.010909,2020.0,55.74554,Industrial,4
1,4723.471398,,,4.0,0.0,3.0,,5.083094,46.340402,3626.311603,...,34.313537,39.794477,3.560183,12.198698,46.044637,100.535265,2021.0,44.268702,Mixed,4
2,6295.377076,29.026189,36269.603084,,2.0,0.0,0.247585,4.089341,31.838015,49082.549371,...,35.43075,49.022339,5.456112,16.765462,34.875393,136.993995,2020.0,73.174815,Mixed,5
3,8046.059713,36.10418,66591.684435,5.0,2.0,0.0,0.993784,,64.489689,30025.861163,...,46.964789,30.998953,9.443431,18.425252,71.182735,89.952283,2019.0,79.5166,Industrial,2
4,4531.693251,46.971785,25198.346488,6.0,0.0,0.0,0.838243,6.676002,48.426125,,...,67.517386,39.880218,8.398393,15.101962,108.795957,64.845688,2018.0,47.145522,Industrial,1


Preprocessing of Null data


In [9]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
from sklearn.pipeline import Pipeline

X_train, X_test, y_train, y_test = train_test_split(
        X, Y, test_size=0.2, random_state=42, stratify=Y
    )

pipeline = Pipeline([
        ('imputer', KNNImputer(n_neighbors=5)),
        ('scaler', StandardScaler()),
        ('smote', SMOTE(random_state=42)),
        ('classifier', RandomForestClassifier(
            n_estimators=200,
            max_depth=15,
            min_samples_split=5,
            min_samples_leaf=2,
            class_weight='balanced',
            random_state=42
        ))
    ])

pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Feature importance analysis
feature_importance = pd.DataFrame({
    'feature': X_train.columns,
    'importance': pipeline.named_steps['classifier'].feature_importances_
})
feature_importance = feature_importance.sort_values('importance', ascending=False)

print("\nTop 10 Most Important Features:")
print(feature_importance.head(10))

TypeError: All intermediate steps should be transformers and implement fit and transform or be the string 'passthrough' 'SMOTE(random_state=42)' (type <class 'imblearn.over_sampling._smote.base.SMOTE'>) doesn't

In [None]:
scores = cross_val_score(pipeline, X, Y, cv=5, scoring='accuracy')
print(f"\nCross-validation scores: {scores}")
print(f"Average CV score: {scores.mean():.3f} (+/- {scores.std() * 2:.3f})")

In [5]:
from sklearn.impute import KNNImputer
numerical_features = [
    "population_density", 'median_age', 'average_household_income', 
    'number_of_schools', 'number_of_hospitals', 'number_of_parks', 
    'public_transport_availability', 'unemployment_rate', 'retail_density', 
    'office_space_availability', 'green_space_percentage', 'average_temperature', 
    'air_quality_index', 'crime_rate_per_1000', 'fire_station_proximity', 
    'proximity_to_highways', 'internet_speed', 'road_density', 
    'housing_price_index', 'year'
]
# imputer = SimpleImputer(strategy='median') 
# numerical_features = ["population_density",'median_age', 'average_household_income', 'number_of_schools', 'number_of_hospitals','number_of_parks','public_transport_availability' ,'unemployment_rate', 'retail_density','office_space_availability','green_space_percentage','average_temperature','air_quality_index','crime_rate_per_1000','fire_station_proximity','proximity_to_highways','internet_speed','road_density','housing_price_index','year',  ]
# df[numerical_features] = imputer.fit_transform(df[numerical_features])
# df.isnull().sum()

knn_imputer = KNNImputer(n_neighbors=5)  
X[numerical_features] = knn_imputer.fit_transform(X[numerical_features])
test_data[numerical_features] = knn_imputer.fit_transform(test_data[numerical_features])
print(X.isnull().sum())


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[numerical_features] = knn_imputer.fit_transform(X[numerical_features])


population_density               0
median_age                       0
average_household_income         0
number_of_schools                0
number_of_hospitals              0
number_of_parks                  0
public_transport_availability    0
unemployment_rate                0
retail_density                   0
office_space_availability        0
green_space_percentage           0
average_temperature              0
air_quality_index                0
crime_rate_per_1000              0
fire_station_proximity           0
proximity_to_highways            0
internet_speed                   0
housing_price_index              0
year                             0
road_density                     0
dtype: int64


normalization of data

In [26]:
from tensorflow.keras.utils import to_categorical
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=X.columns, index=X.index)
# Y = le.fit_transform(Y)
# Y = to_categorical(Y)
scaled_data = scaler.transform(test_data)
test_data_scaled = pd.DataFrame(scaled_data, columns=test_data.columns, index=test_data.index)



check outliers

In [27]:
from scipy.stats import zscore
z_scores = np.abs(zscore(X_scaled)) 
outlier_mask = (z_scores > 3).any(axis=1)
plt.figure(figsize=(10, 6))
print(z_scores)

      population_density  median_age  average_household_income  \
5846            3.005547    0.709511                  0.467174   
7544            1.098085    1.672421                  0.394031   
4545            0.614504    1.677495                  0.075691   
6680            0.303540    0.358588                  0.081952   
5855            1.816925    0.285117                  0.136801   
...                  ...         ...                       ...   
6520            0.337106    0.030071                  0.194442   
8482            1.970142    0.158172                  0.132746   
6877            0.730339    0.983954                  0.122113   
4907            1.158970    0.778371                  0.216804   
9991            0.396137    0.547157                  0.580031   

      number_of_schools  number_of_hospitals  number_of_parks  \
5846           0.904290             0.734323         0.042640   
7544           0.921857             0.724780         1.665695   
4545        

<Figure size 1000x600 with 0 Axes>

trying some changes on the data

In [None]:
from xgboost import XGBClassifier
from sklearn.feature_selection import SelectFromModel

model = XGBClassifier()
model.fit(X_scaled, Y)
importances = model.feature_importances_
threshold = 0.
important_features = importances > threshold
X_selected = X_scaled[:, important_features]
test_data_selected = test_data.iloc[:, important_features]

print(f"Selected Features: {len(important_features)}")

# Optional: View the first few rows of the selected data
print("Selected data (X):")
print(X_selected[:5])
print("Selected test data:")
print(test_data_selected.head())

In [52]:
X_scaled['community_services'] = X_scaled['number_of_schools'] + X_scaled['number_of_hospitals'] + X_scaled['number_of_parks']
X_scaled['accessibility'] = X_scaled['public_transport_availability'] * X_scaled['road_density']
X_scaled.shape
X_new = X_scaled
X_new = X_scaled.drop(columns=['number_of_schools', 'number_of_hospitals', 'number_of_parks', 'public_transport_availability', 'road_density'])

test_data_scaled['community_services'] = test_data_scaled['number_of_schools'] + test_data_scaled['number_of_hospitals'] + test_data_scaled['number_of_parks']
test_data_scaled['accessibility'] = test_data_scaled['public_transport_availability'] * test_data_scaled['road_density']
test_data_scaled.shape
test_data_new = test_data_scaled.drop(columns=['number_of_schools', 'number_of_hospitals', 'number_of_parks', 'public_transport_availability', 'road_density'])

split data

In [49]:
train_x, test_x, train_y, test_y = train_test_split(X_new, Y, test_size=0.2, random_state=42)
print(train_x.shape, test_x.shape, train_y.shape, test_y.shape)
# train_x, train_y = X, Y


(8000, 17) (2000, 17) (8000,) (2000,)


SVM model

In [30]:
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier


model = svm.SVC(kernel='linear', gamma='scale' , C=0.1)
model.fit(train_x,train_y)

# ovo_model = OneVsOneClassifier(model)
# ovo_model.fit(train_x, train_y)
y_pred = model.predict(test_x)

# Evaluate the model
print(f"Test Accuracy: {accuracy_score(test_y, y_pred)}")
print(f"Classification Report for SVM:\n{classification_report(test_y, y_pred)}")

Test Accuracy: 0.215
Classification Report for SVM:
              precision    recall  f1-score   support

           0       0.26      0.30      0.28       415
           1       0.18      0.14      0.16       395
           2       0.19      0.16      0.17       387
           3       0.22      0.25      0.23       410
           4       0.20      0.22      0.21       393

    accuracy                           0.21      2000
   macro avg       0.21      0.21      0.21      2000
weighted avg       0.21      0.21      0.21      2000



rf model 

In [50]:
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier

# rfm = RandomForestClassifier(max_depth=10, min_samples_leaf=4, min_samples_split=5, n_estimators=100, random_state=42)
rfm = RandomForestClassifier(random_state=42, n_estimators=200)
# rfm.fit(X, Y)
rfm.fit(train_x, train_y)
y_pred_rf = rfm.predict(test_x)

print(f"Test Accuracy for Random Forest: {accuracy_score(test_y, y_pred_rf)}")
print(f"Classification Report for Random Forest:\n{classification_report(test_y, y_pred_rf)}")


Test Accuracy for Random Forest: 0.61
Classification Report for Random Forest:
              precision    recall  f1-score   support

           0       0.91      0.79      0.85       415
           1       0.34      0.42      0.38       395
           2       0.34      0.42      0.38       387
           3       0.72      0.57      0.64       410
           4       0.93      0.84      0.88       393

    accuracy                           0.61      2000
   macro avg       0.65      0.61      0.62      2000
weighted avg       0.65      0.61      0.63      2000



NN model 

In [37]:
# print(train_y.shape)

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import accuracy_score, classification_report

train_y = to_categorical(train_y)
test_y = to_categorical(test_y)


model = Sequential([
    Dense(128, activation='relu', input_shape=(train_x.shape[1],)),  
    Dropout(0.3),  
    Dense(64, activation='relu'), 
    Dropout(0.3),
    Dense(train_y.shape[1], activation='softmax')  
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
history = model.fit(train_x, train_y, validation_split=0.2, epochs=50, batch_size=32, verbose=1)

# Évaluation du modèle
test_loss, test_accuracy = model.evaluate(test_x, test_y)
print(f"Loss sur le test : {test_loss:.4f}")
print(f"Précision sur le test : {test_accuracy:.4f}")



# y_true = np.argmax(test_y, axis=1)
# print(f"Accuracy: {accuracy_score(y_true, y_pred):.4f}")
# print(f"Classification Report:\n{classification_report(y_true, y_pred)}")
# # Prédictions
# y_pred = model.predict(test_x)
# print(f"Test Accuracy for Random Forest: {accuracy_score(test_y, y_pred_rf)}")
# print(f"Classification Report for Random Forest:\n{classification_report(test_y, y_pred_rf)}")


Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.1991 - loss: 1.6975 - val_accuracy: 0.2194 - val_loss: 1.6110
Epoch 2/50
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.2188 - loss: 1.6148 - val_accuracy: 0.2113 - val_loss: 1.6080
Epoch 3/50
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.2324 - loss: 1.6085 - val_accuracy: 0.2288 - val_loss: 1.6056
Epoch 4/50
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.2507 - loss: 1.5907 - val_accuracy: 0.2288 - val_loss: 1.6021
Epoch 5/50
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.2615 - loss: 1.5825 - val_accuracy: 0.2544 - val_loss: 1.5992
Epoch 6/50
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.2637 - loss: 1.5800 - val_accuracy: 0.2419 - val_loss: 1.5964
Epoch 7/50
[1m200/200[0m [32m━━━━━━━

In [43]:
y_pred = np.argmax(model.predict(test_x), axis=1)
y_pred +=1
y_true = np.argmax(test_y, axis=1)
y_true +=1
y_true[:10], y_pred[:10]
print(f"Test Accuracy for NN: {accuracy_score(y_true, y_pred)}")
print(f"Classification Report for Random Forest:\n{classification_report(y_true, y_pred)}")

[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Test Accuracy for NN: 0.383
Classification Report for Random Forest:
              precision    recall  f1-score   support

           1       0.51      0.51      0.51       415
           2       0.26      0.18      0.21       395
           3       0.26      0.26      0.26       387
           4       0.44      0.34      0.38       410
           5       0.41      0.62      0.49       393

    accuracy                           0.38      2000
   macro avg       0.37      0.38      0.37      2000
weighted avg       0.38      0.38      0.37      2000



export result to csv from the test data

In [53]:
# output = np.argmax(model.predict(test_data), axis=1)
# output += 1
output = rfm.predict(test_data_new) #here changing the model 
output += 1
submission = pd.DataFrame({
    "ID": test_data.index + 1,  
    "development_trend_score": output
})
submission_file_path = './test/development_trend_predictions_classification.csv'
submission.to_csv(submission_file_path, index=False)
print(f"Submission file saved at: {submission_file_path}")

Submission file saved at: ./test/development_trend_predictions_classification.csv


In [16]:
# Train a Gaussian Naive Bayes model
nb_model = GaussianNB()
nb_model.fit(train_x, train_y)

# Predict on the test set
nb_y_pred = nb_model.predict(test_x)

# Evaluate the model
print(f"Test Accuracy: {accuracy_score(test_y, nb_y_pred)}")
print(f"Classification Report for Gaussian Naive Bayes:\n{classification_report(test_y, nb_y_pred)}")

Test Accuracy: 0.3995
Classification Report for Gaussian Naive Bayes:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       184
           1       0.12      0.01      0.01       387
           2       0.40      0.99      0.57       808
           3       0.00      0.00      0.00       410
           4       0.00      0.00      0.00       211

    accuracy                           0.40      2000
   macro avg       0.11      0.20      0.12      2000
weighted avg       0.19      0.40      0.23      2000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
