In [370]:
# !pip install pyspark
# !pip install imblearn

In [371]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import math
import sklearn.model_selection as model_selection
import pyspark
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.linear_model import Perceptron
from sklearn.model_selection import KFold, cross_val_score,StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
from imblearn.over_sampling import RandomOverSampler,SMOTE
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

### Read the data

In [372]:
df = pd.read_csv('body_level_classification_train.csv')
df.head()

Unnamed: 0,Gender,Age,Height,Weight,H_Cal_Consump,Veg_Consump,Water_Consump,Alcohol_Consump,Smoking,Meal_Count,Food_Between_Meals,Fam_Hist,H_Cal_Burn,Phys_Act,Time_E_Dev,Transport,Body_Level
0,Female,22.547298,1.722461,51.881263,yes,2.663421,1.04111,no,no,3.0,Frequently,yes,no,0.794402,1.391948,Public_Transportation,Body Level 1
1,Male,19.799054,1.743702,54.927529,yes,2.0,2.847264,Sometimes,no,3.28926,Sometimes,yes,no,1.680844,2.0,Public_Transportation,Body Level 1
2,Female,17.823438,1.708406,50.0,yes,1.642241,1.099231,Sometimes,no,3.45259,Sometimes,no,no,0.418875,1.0,Public_Transportation,Body Level 1
3,Female,19.007177,1.690727,49.895716,yes,1.212908,1.029703,Sometimes,no,3.207071,Sometimes,no,no,2.0,1.0,Public_Transportation,Body Level 1
4,Male,19.72925,1.793315,58.19515,yes,2.508835,2.076933,no,no,3.435905,Sometimes,yes,no,2.026668,1.443328,Automobile,Body Level 1


### Convert Categorical to Numerical

In [373]:
for col in ["Gender", "H_Cal_Consump", "Alcohol_Consump", "Smoking", "Food_Between_Meals", "Fam_Hist", "H_Cal_Burn", "Transport","Body_Level"]:
    df[col] = pd.Categorical(df[col], categories=df[col].unique()).codes
    

### multi_class Parameter:
The handling of multinomial classification problems. This can be 'auto', 'ovr', or 'multinomial'. The default value is 'auto'.

In [374]:
# separate the features and target variable
X = df.drop('Body_Level', axis=1)
y = df['Body_Level']


# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X,y,stratify=y,test_size=0.3, random_state=42) 


# Logistic Regression
for multi_class in ('multinomial', 'ovr'):
    print(f"Training Logistic Regression Classifier with {multi_class}")
    model = LogisticRegression(multi_class = multi_class,max_iter = 30000).fit(X_train, y_train)

    # Predicting the Test set results
    y_train_predict = model.predict(X_train)
    print(f"Training Accurracy: {accuracy_score(y_train, y_train_predict) * 100}%")
    y_test_predict = model.predict(X_test)
    print(f"Testing Accurracy: {accuracy_score(y_test, y_test_predict) * 100}%")


    # create a KFold object
    kf = KFold(n_splits=10, shuffle=True, random_state=42)

    # perform cross-validation
    scores = cross_val_score(model, X, y, cv=kf, scoring='f1_macro')

    # Print the classification report and mean F1 score
    print(classification_report(y_test, model.predict(X_test)))
    print("Mean F1 score:", scores.mean())

    # print the mean and standard deviation of the scores
    print(f"Accuracy after applying cross validation: {scores.mean():.5f} (+/- {scores.std():.5f})")
    print("------------------------------------------------------------------------")

Training Logistic Regression Classifier with multinomial
Training Accurracy: 89.44820909970959%
Testing Accurracy: 86.93693693693693%
              precision    recall  f1-score   support

           0       0.81      0.95      0.87        57
           1       0.83      0.56      0.67        61
           2       0.78      0.91      0.84       122
           3       0.96      0.92      0.94       204

    accuracy                           0.87       444
   macro avg       0.85      0.83      0.83       444
weighted avg       0.88      0.87      0.87       444

Mean F1 score: 0.8283040110496392
Accuracy after applying cross validation: 0.82830 (+/- 0.02952)
------------------------------------------------------------------------
Training Logistic Regression Classifier with ovr
Training Accurracy: 84.99515972894483%
Testing Accurracy: 81.08108108108108%
              precision    recall  f1-score   support

           0       0.77      0.95      0.85        57
           1       0.63  

### solver Parameter: 
The algorithm to be used for optimization. This can be 'newton-cg', 'lbfgs', 'liblinear', 'sag', or 'saga'. The default value is 'lbfgs'.



In [375]:
# separate the features and target variable
X = df.drop('Body_Level', axis=1)
y = df['Body_Level']


# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X,y,stratify=y,test_size=0.3, random_state=42) 


# Logistic Regression
for solver in ('newton-cg', 'lbfgs', 'sag', 'saga'):
    print(f"Training Logistic Regression Classifier with {solver} solver")
    model = LogisticRegression(multi_class = "multinomial",solver = solver,max_iter = 30000).fit(X_train, y_train)

    # Predicting the Test set results
    y_train_predict = model.predict(X_train)
    print(f"Training Accurracy: {accuracy_score(y_train, y_train_predict) * 100}%")
    y_test_predict = model.predict(X_test)
    print(f"Testing Accurracy: {accuracy_score(y_test, y_test_predict) * 100}%")


    # create a KFold object
    kf = KFold(n_splits=10, shuffle=True, random_state=42)

    # perform cross-validation
    scores = cross_val_score(model, X, y, cv=kf, scoring='f1_macro')

    # Print the classification report and mean F1 score
    print(classification_report(y_test, model.predict(X_test)))
    print("Mean F1 score:", scores.mean())

    # print the mean and standard deviation of the scores
    print(f"Accuracy after applying cross validation: {scores.mean():.5f} (+/- {scores.std():.5f})")
    print("------------------------------------------------------------------------")

Training Logistic Regression Classifier with newton-cg solver
Training Accurracy: 89.351403678606%
Testing Accurracy: 86.93693693693693%
              precision    recall  f1-score   support

           0       0.81      0.95      0.87        57
           1       0.83      0.56      0.67        61
           2       0.78      0.91      0.84       122
           3       0.96      0.92      0.94       204

    accuracy                           0.87       444
   macro avg       0.85      0.83      0.83       444
weighted avg       0.88      0.87      0.87       444

Mean F1 score: 0.8283040110496392
Accuracy after applying cross validation: 0.82830 (+/- 0.02952)
------------------------------------------------------------------------
Training Logistic Regression Classifier with lbfgs solver
Training Accurracy: 89.44820909970959%
Testing Accurracy: 86.93693693693693%
              precision    recall  f1-score   support

           0       0.81      0.95      0.87        57
           1 

#### penalty Parameter: 
The regularization penalty to be used. This can be 'l1', 'l2', 'elasticnet', or 'none'. The default value is 'l2'


In [376]:
# separate the features and target variable
X = df.drop('Body_Level', axis=1)
y = df['Body_Level']


# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X,y,stratify=y,test_size=0.3, random_state=42) 


# Logistic Regression
penalty = 'l2'
print(f"Training Logistic Regression Classifier with {penalty} Regularization")
model = LogisticRegression(multi_class = "multinomial",solver = 'lbfgs', penalty = penalty,max_iter = 50000).fit(X_train, y_train)

# Predicting the Test set results
y_train_predict = model.predict(X_train)
print(f"Training Accurracy: {accuracy_score(y_train, y_train_predict) * 100}%")
y_test_predict = model.predict(X_test)
print(f"Testing Accurracy: {accuracy_score(y_test, y_test_predict) * 100}%")


# create a KFold object
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# perform cross-validation
scores = cross_val_score(model, X, y, cv=kf, scoring='f1_macro')

# Print the classification report and mean F1 score
print(classification_report(y_test, model.predict(X_test)))
print("Mean F1 score:", scores.mean())

# print the mean and standard deviation of the scores
print(f"Accuracy after applying cross validation: {scores.mean():.5f} (+/- {scores.std():.5f})")
print("------------------------------------------------------------------------")

Training Logistic Regression Classifier with l2 Regularization
Training Accurracy: 89.44820909970959%
Testing Accurracy: 86.93693693693693%
              precision    recall  f1-score   support

           0       0.81      0.95      0.87        57
           1       0.83      0.56      0.67        61
           2       0.78      0.91      0.84       122
           3       0.96      0.92      0.94       204

    accuracy                           0.87       444
   macro avg       0.85      0.83      0.83       444
weighted avg       0.88      0.87      0.87       444

Mean F1 score: 0.8283040110496392
Accuracy after applying cross validation: 0.82830 (+/- 0.02952)
------------------------------------------------------------------------


#### C Parameter: 
The inverse of the regularization strength. Smaller values of C specify stronger regularization. The default value is 1.0.

In [377]:
# separate the features and target variable
X = df.drop('Body_Level', axis=1)
y = df['Body_Level']


# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X,y,stratify=y,test_size=0.3, random_state=42) 


# Logistic Regression
for c in [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8]:
    print(f"Training Logistic Regression Classifier with {c} Regularization strength")
    model = LogisticRegression(multi_class = "multinomial",solver = 'lbfgs', penalty = 'l2', C=c ,max_iter = 50000).fit(X_train, y_train)

    # Predicting the Test set results
    y_train_predict = model.predict(X_train)
    print(f"Training Accurracy: {accuracy_score(y_train, y_train_predict) * 100}%")
    y_test_predict = model.predict(X_test)
    print(f"Testing Accurracy: {accuracy_score(y_test, y_test_predict) * 100}%")


    # create a KFold object
    kf = KFold(n_splits=10, shuffle=True, random_state=42)

    # perform cross-validation
    scores = cross_val_score(model, X, y, cv=kf, scoring='f1_macro')

    # Print the classification report and mean F1 score
    print(classification_report(y_test, model.predict(X_test)))
    print("Mean F1 score:", scores.mean())

    # print the mean and standard deviation of the scores
    print(f"Accuracy after applying cross validation: {scores.mean():.5f} (+/- {scores.std():.5f})")
    print("------------------------------------------------------------------------")

Training Logistic Regression Classifier with 0.1 Regularization strength
Training Accurracy: 84.12391093901257%
Testing Accurracy: 80.85585585585585%
              precision    recall  f1-score   support

           0       0.76      0.93      0.83        57
           1       0.71      0.44      0.55        61
           2       0.70      0.84      0.76       122
           3       0.94      0.86      0.90       204

    accuracy                           0.81       444
   macro avg       0.77      0.77      0.76       444
weighted avg       0.82      0.81      0.80       444

Mean F1 score: 0.7807256306708318
Accuracy after applying cross validation: 0.78073 (+/- 0.04378)
------------------------------------------------------------------------
Training Logistic Regression Classifier with 0.2 Regularization strength
Training Accurracy: 86.64085188770572%
Testing Accurracy: 83.55855855855856%
              precision    recall  f1-score   support

           0       0.77      0.95      

#### class_weight Parameter: 
The weights to assign to each class. This can be 'balanced' to automatically adjust the weights based on the class frequencies, or a dictionary with custom weights.

In [378]:
# separate the features and target variable
X = df.drop('Body_Level', axis=1)
y = df['Body_Level']


# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X,y,stratify=y,test_size=0.3, random_state=42) 


# we will use the class_weight parameter to weight the classes
frequencies = df['Body_Level'].value_counts()
count = df['Body_Level'].count()
freq_level1,freq_level2,freq_level3,freq_level4 = frequencies[0]/count,frequencies[1]/count,frequencies[2]/count,frequencies[3]/count
class_weight =  {0: freq_level1, 1: freq_level2, 2: freq_level3, 3: freq_level4}

# Logistic Regression
model = LogisticRegression(class_weight=class_weight,multi_class = "multinomial",solver = 'lbfgs', penalty = 'l2', C=0.8 ,max_iter = 50000).fit(X_train, y_train)


# Predicting the Test set results
y_train_predict = model.predict(X_train)
print(f"Training Accurracy: {accuracy_score(y_train, y_train_predict) * 100}%")
y_test_predict = model.predict(X_test)
print(f"Testing Accurracy: {accuracy_score(y_test, y_test_predict) * 100}%")


# create a KFold object
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# perform cross-validation
scores = cross_val_score(model, X, y, cv=kf, scoring='f1_macro')

# Print the classification report and mean F1 score
print(classification_report(y_test, model.predict(X_test)))
print("Mean F1 score:", scores.mean())

# print the mean and standard deviation of the scores
print(f"Accuracy after applying cross validation: {scores.mean():.5f} (+/- {scores.std():.5f})")


Training Accurracy: 86.44724104549854%
Testing Accurracy: 81.98198198198197%
              precision    recall  f1-score   support

           0       0.75      0.93      0.83        57
           1       0.79      0.31      0.45        61
           2       0.73      0.82      0.77       122
           3       0.91      0.94      0.92       204

    accuracy                           0.82       444
   macro avg       0.79      0.75      0.74       444
weighted avg       0.82      0.82      0.80       444

Mean F1 score: 0.7711715016511844
Accuracy after applying cross validation: 0.77117 (+/- 0.05104)


### Apply Principle Component Analysis on the features 

In [379]:
# separate the features and target variable
X = df.drop('Body_Level', axis=1)
y = df['Body_Level']

pca = PCA(n_components=13)
# Fit the PCA model to the data
pca.fit(X)
# Transform the data using the fitted PCA model
X = pca.transform(X)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X,y,stratify=y,test_size=0.3, random_state=42) 


# Logistic Regression
model = LogisticRegression(multi_class = "multinomial",solver = 'lbfgs', penalty = 'l2', C = 0.8 ,max_iter = 50000).fit(X_train, y_train)


# Predicting the Test set results
y_train_predict = model.predict(X_train)
print(f"Training Accurracy: {accuracy_score(y_train, y_train_predict) * 100}%")
y_test_predict = model.predict(X_test)
print(f"Testing Accurracy: {accuracy_score(y_test, y_test_predict) * 100}%")


# create a KFold object
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# perform cross-validation
scores = cross_val_score(model, X, y, cv=kf, scoring='f1_macro')

# Print the classification report and mean F1 score
print(classification_report(y_test, model.predict(X_test)))
print("Mean F1 score:", scores.mean())

# print the mean and standard deviation of the scores
print(f"Accuracy after applying cross validation: {scores.mean():.5f} (+/- {scores.std():.5f})")

Training Accurracy: 87.89932236205227%
Testing Accurracy: 85.13513513513513%
              precision    recall  f1-score   support

           0       0.79      0.95      0.86        57
           1       0.80      0.54      0.65        61
           2       0.77      0.85      0.81       122
           3       0.94      0.92      0.93       204

    accuracy                           0.85       444
   macro avg       0.83      0.81      0.81       444
weighted avg       0.85      0.85      0.85       444

Mean F1 score: 0.7996258713539676
Accuracy after applying cross validation: 0.79963 (+/- 0.04185)


#### Normalize the features

In [380]:
# separate the features and target variable
X = df.drop('Body_Level', axis=1)
y = df['Body_Level']


# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X,y,stratify=y,test_size=0.3, random_state=42) 


# Normalized data
stand = preprocessing.StandardScaler()
X_train = stand.fit_transform(X_train)
X_test = stand.fit_transform(X_test)



# Logistic Regression
model = LogisticRegression(multi_class = "multinomial",solver = 'lbfgs', penalty = 'l2', C = 0.8 ,max_iter = 50000).fit(X_train, y_train)


# Predicting the Test set results
y_train_predict = model.predict(X_train)
print(f"Training Accurracy: {accuracy_score(y_train, y_train_predict) * 100}%")
y_test_predict = model.predict(X_test)
print(f"Testing Accurracy: {accuracy_score(y_test, y_test_predict) * 100}%")


# create a KFold object
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# perform cross-validation
scores = cross_val_score(model, X, y, cv=kf, scoring='f1_macro')

# Print the classification report and mean F1 score
print(classification_report(y_test, model.predict(X_test)))
print("Mean F1 score:", scores.mean())

# print the mean and standard deviation of the scores
print(f"Accuracy after applying cross validation: {scores.mean():.5f} (+/- {scores.std():.5f})")


Training Accurracy: 94.28848015488867%
Testing Accurracy: 92.11711711711712%
              precision    recall  f1-score   support

           0       0.80      1.00      0.89        57
           1       0.97      0.52      0.68        61
           2       0.87      0.97      0.92       122
           3       0.99      0.99      0.99       204

    accuracy                           0.92       444
   macro avg       0.91      0.87      0.87       444
weighted avg       0.93      0.92      0.91       444

Mean F1 score: 0.8187669503705208
Accuracy after applying cross validation: 0.81877 (+/- 0.03295)


#### Over Sampling to handle class imbalance

In [381]:
# separate the features and target variable
X = df.drop('Body_Level', axis=1)
y = df['Body_Level']


# Oversampling to balance the data
smote = SMOTE(random_state=42)
X, y = smote.fit_resample(X, y)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X,y,stratify=y,test_size=0.3, random_state=42) 


# Normalized data
stand = preprocessing.StandardScaler()
X_train = stand.fit_transform(X_train)
X_test = stand.fit_transform(X_test)

# Logistic Regression
model = LogisticRegression(multi_class = "multinomial",solver = 'lbfgs', penalty = 'l2', C = 0.8 ,max_iter = 50000).fit(X_train, y_train)


# Predicting the Test set results
y_train_predict = model.predict(X_train)
print(f"Training Accurracy: {accuracy_score(y_train, y_train_predict) * 100}%")
y_test_predict = model.predict(X_test)
print(f"Testing Accurracy: {accuracy_score(y_test, y_test_predict) * 100}%")


# create a KFold object
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# perform cross-validation
scores = cross_val_score(model, X, y, cv=kf, scoring='f1_macro')

# Print the classification report and mean F1 score
print(classification_report(y_test, model.predict(X_test)))
print("Mean F1 score:", scores.mean())

# print the mean and standard deviation of the scores
print(f"Accuracy after applying cross validation: {scores.mean():.5f} (+/- {scores.std():.5f})")


Training Accurracy: 96.69117647058823%
Testing Accurracy: 96.69117647058823%
              precision    recall  f1-score   support

           0       0.97      1.00      0.98       204
           1       0.99      0.93      0.96       204
           2       0.93      0.98      0.95       204
           3       0.98      0.97      0.98       204

    accuracy                           0.97       816
   macro avg       0.97      0.97      0.97       816
weighted avg       0.97      0.97      0.97       816

Mean F1 score: 0.8713992065829025
Accuracy after applying cross validation: 0.87140 (+/- 0.02132)


### Select numerical features

In [382]:
# separate the features and target variable
X = df[['Age', 'Height', 'Weight','Veg_Consump','Water_Consump','Meal_Count','Phys_Act','Time_E_Dev']]
y = df['Body_Level']


# Oversampling to balance the data
smote = SMOTE(random_state=42)
X, y = smote.fit_resample(X, y)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X,y,stratify=y,test_size=0.3, random_state=42) 


# Normalized data
stand = preprocessing.StandardScaler()
X_train = stand.fit_transform(X_train)
X_test = stand.fit_transform(X_test)

# Logistic Regression
model = LogisticRegression(multi_class = "multinomial",solver = 'lbfgs', penalty = 'l2', C = 0.8 ,max_iter = 50000).fit(X_train, y_train)


# Predicting the Test set results
y_train_predict = model.predict(X_train)
print(f"Training Accurracy: {accuracy_score(y_train, y_train_predict) * 100}%")
y_test_predict = model.predict(X_test)
print(f"Testing Accurracy: {accuracy_score(y_test, y_test_predict) * 100}%")


# create a KFold object
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# perform cross-validation
scores = cross_val_score(model, X, y, cv=kf, scoring='f1_macro')

# Print the classification report and mean F1 score
print(classification_report(y_test, model.predict(X_test)))
print("Mean F1 score:", scores.mean())

# print the mean and standard deviation of the scores
print(f"Accuracy after applying cross validation: {scores.mean():.5f} (+/- {scores.std():.5f})")


Training Accurracy: 96.84873949579831%
Testing Accurracy: 96.20098039215686%
              precision    recall  f1-score   support

           0       0.94      1.00      0.97       204
           1       0.98      0.90      0.94       204
           2       0.93      0.98      0.96       204
           3       1.00      0.97      0.99       204

    accuracy                           0.96       816
   macro avg       0.96      0.96      0.96       816
weighted avg       0.96      0.96      0.96       816

Mean F1 score: 0.8321557762678594
Accuracy after applying cross validation: 0.83216 (+/- 0.02564)
