<a href="https://colab.research.google.com/github/neriandria/MachineLearning/blob/main/MachineLearning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [62]:
pip install numpy pandas scikit-learn




In [63]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [64]:
df = pd.read_csv("Adult data.csv")

In [65]:
df.head()

Unnamed: 0,Age,Workclass,Fnlwgt,Education,Education-num,Marital-status,Occupation,Relationship,Race,Sex,Capital-gain,Capital-loss,Hours-per-week,Native-country,Income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [66]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Age             32561 non-null  int64 
 1   Workclass       32561 non-null  object
 2   Fnlwgt          32561 non-null  int64 
 3   Education       32561 non-null  object
 4   Education-num   32561 non-null  int64 
 5   Marital-status  32561 non-null  object
 6   Occupation      32561 non-null  object
 7   Relationship    32561 non-null  object
 8   Race            32561 non-null  object
 9   Sex             32561 non-null  object
 10  Capital-gain    32561 non-null  int64 
 11  Capital-loss    32561 non-null  int64 
 12  Hours-per-week  32561 non-null  int64 
 13  Native-country  32561 non-null  object
 14  Income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [67]:
df.describe()

Unnamed: 0,Age,Fnlwgt,Education-num,Capital-gain,Capital-loss,Hours-per-week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [68]:
#Find rows with "?" values in Adult dataset
question_mark_rows = df[df.apply(lambda row: row.astype(str).str.contains('\?')).any(axis=1)]

print("Rows with '?' values in Adult dataset:")
print(question_mark_rows)

Rows with '?' values in Adult dataset:
       Age Workclass  Fnlwgt      Education  Education-num  \
14      40   Private  121772      Assoc-voc             11   
27      54         ?  180211   Some-college             10   
38      31   Private   84154   Some-college             10   
51      18   Private  226956        HS-grad              9   
61      32         ?  293936        7th-8th              4   
...    ...       ...     ...            ...            ...   
32530   35         ?  320084      Bachelors             13   
32531   30         ?   33811      Bachelors             13   
32539   71         ?  287372      Doctorate             16   
32541   41         ?  202822        HS-grad              9   
32542   72         ?  129912        HS-grad              9   

               Marital-status      Occupation    Relationship  \
14         Married-civ-spouse    Craft-repair         Husband   
27         Married-civ-spouse               ?         Husband   
38         Married-ci

In [69]:
# Replace "?" values with mode of respective columns
for column in df.columns:
    if df[column].dtype == 'object':  # Check if the column is categorical
        df[column] = df[column].replace(' ?', df[column].mode()[0])


In [70]:
print(df)
print(df.isnull().sum())


       Age          Workclass  Fnlwgt    Education  Education-num  \
0       39          State-gov   77516    Bachelors             13   
1       50   Self-emp-not-inc   83311    Bachelors             13   
2       38            Private  215646      HS-grad              9   
3       53            Private  234721         11th              7   
4       28            Private  338409    Bachelors             13   
...    ...                ...     ...          ...            ...   
32556   27            Private  257302   Assoc-acdm             12   
32557   40            Private  154374      HS-grad              9   
32558   58            Private  151910      HS-grad              9   
32559   22            Private  201490      HS-grad              9   
32560   52       Self-emp-inc  287927      HS-grad              9   

            Marital-status          Occupation    Relationship    Race  \
0            Never-married        Adm-clerical   Not-in-family   White   
1       Married-civ-spo

In [71]:
# Split the dataset into features (X) and target variable (y)
X = df.drop(columns=['Income'])
y = df['Income']

# Encode categorical variables using one-hot encoding
X_encoded = pd.get_dummies(X)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)


In [72]:
# Encode 'Income' column as True for '<=50K' and False for '>50K'
df['Income'] = df['Income'].apply(lambda x: x.strip() == '<=50K')

In [73]:
print(df.head())

   Age          Workclass  Fnlwgt   Education  Education-num  \
0   39          State-gov   77516   Bachelors             13   
1   50   Self-emp-not-inc   83311   Bachelors             13   
2   38            Private  215646     HS-grad              9   
3   53            Private  234721        11th              7   
4   28            Private  338409   Bachelors             13   

        Marital-status          Occupation    Relationship    Race      Sex  \
0        Never-married        Adm-clerical   Not-in-family   White     Male   
1   Married-civ-spouse     Exec-managerial         Husband   White     Male   
2             Divorced   Handlers-cleaners   Not-in-family   White     Male   
3   Married-civ-spouse   Handlers-cleaners         Husband   Black     Male   
4   Married-civ-spouse      Prof-specialty            Wife   Black   Female   

   Capital-gain  Capital-loss  Hours-per-week  Native-country  Income  
0          2174             0              40   United-States    Tru

In [74]:
# Find duplicate rows
duplicate_rows = df.duplicated()

# Print duplicate rows
print("Duplicate Rows:")
print(df[duplicate_rows])

# Remove duplicate rows
df_cleaned = df[~duplicate_rows]

# Print information about removed duplicates
print("\nNumber of duplicate rows removed:", sum(duplicate_rows))
print("Shape of the cleaned dataset:", df_cleaned.shape)


Duplicate Rows:
       Age          Workclass  Fnlwgt      Education  Education-num  \
4881    25            Private  308144      Bachelors             13   
5104    90            Private   52386   Some-college             10   
9171    21            Private  250051   Some-college             10   
11631   20            Private  107658   Some-college             10   
13084   25            Private  195994        1st-4th              2   
15059   21            Private  243368      Preschool              1   
17040   46            Private  173243        HS-grad              9   
18555   30            Private  144593        HS-grad              9   
18698   19            Private   97261        HS-grad              9   
21318   19            Private  138153   Some-college             10   
21490   19            Private  146679   Some-college             10   
21875   49            Private   31267        7th-8th              4   
22300   25            Private  195994        1st-4th         

In [75]:
# Assuming df contains your preprocessed data

# Define features (X) and target (y)
X = df.drop('Income', axis=1)  # Features are all columns except 'Income'
y = df['Income']  # Target is the 'Income' column

# Split the data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



In [76]:
# Random Forest Classification
# One-hot encode categorical variables for both training and testing sets
X_train_encoded = pd.get_dummies(X_train)
X_test_encoded = pd.get_dummies(X_test)

# Align the columns of the testing set with the columns of the training set
X_test_encoded = X_test_encoded.reindex(columns=X_train_encoded.columns, fill_value=0)

# Train the RandomForestClassifier
rf_classifier.fit(X_train_encoded, y_train)

# Predictions
rf_pred = rf_classifier.predict(X_test_encoded)

# Evaluation
print("\nRandom Forest Classification Report:")
print(classification_report(y_test, rf_pred))
print("Random Forest Accuracy:", accuracy_score(y_test, rf_pred))


Random Forest Classification Report:
              precision    recall  f1-score   support

       False       0.74      0.64      0.69      1571
        True       0.89      0.93      0.91      4942

    accuracy                           0.86      6513
   macro avg       0.82      0.78      0.80      6513
weighted avg       0.85      0.86      0.86      6513

Random Forest Accuracy: 0.8590511285122063


In [77]:
# Model training
# Naïve Bayes Classification
from sklearn.naive_bayes import GaussianNB
nb_classifier = GaussianNB()

# Train the Naïve Bayes classifier
nb_classifier.fit(X_train_encoded, y_train)

# Predictions
nb_pred = nb_classifier.predict(X_test_encoded)

# Evaluation
print("Naïve Bayes Classification Report:")
print(classification_report(y_test, nb_pred))
print("Naïve Bayes Accuracy:", accuracy_score(y_test, nb_pred))


Naïve Bayes Classification Report:
              precision    recall  f1-score   support

       False       0.68      0.32      0.43      1571
        True       0.81      0.95      0.88      4942

    accuracy                           0.80      6513
   macro avg       0.75      0.64      0.66      6513
weighted avg       0.78      0.80      0.77      6513

Naïve Bayes Accuracy: 0.7991708889912482


In [83]:
# Save the updated dataset to a new CSV file
df.to_csv('Adult_data_imputed.csv', index=False)