In [15]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
# NOTE(review): this estimator is created in the middle of the import list and
# is not referenced by any of the cells visible here - confirm whether a later
# cell still needs it before moving or removing it.
model = LogisticRegression(max_iter=100000)
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from statsmodels.stats.contingency_tables import mcnemar
from sklearn.preprocessing import LabelEncoder, StandardScaler
import time
from sklearn.datasets import load_iris

In [17]:
# Load the raw file. It is read without a header row, so the attribute names
# are supplied here, following the dataset description on the website.
column_names = [
    'age', 'workclass', 'fnlwgt', 'education', 'education_num',
    'marital_status', 'occupation', 'relationship', 'race', 'sex',
    'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
    'income',
]
data = pd.read_csv('adult.data', names=column_names, header=None)

# Print the frame for a quick visual check of the loaded rows.
print(data)

       age          workclass  fnlwgt    education  education_num  \
0       39          State-gov   77516    Bachelors             13   
1       50   Self-emp-not-inc   83311    Bachelors             13   
2       38            Private  215646      HS-grad              9   
3       53            Private  234721         11th              7   
4       28            Private  338409    Bachelors             13   
...    ...                ...     ...          ...            ...   
32556   27            Private  257302   Assoc-acdm             12   
32557   40            Private  154374      HS-grad              9   
32558   58            Private  151910      HS-grad              9   
32559   22            Private  201490      HS-grad              9   
32560   52       Self-emp-inc  287927      HS-grad              9   

            marital_status          occupation    relationship    race  \
0            Never-married        Adm-clerical   Not-in-family   White   
1       Married-civ-spo

In [18]:
# Handle missing values: entries equal to ' ?' (note the leading space) are
# treated as missing, converted to NaN, and every row containing one is dropped.
data = data.replace(' ?', np.nan).dropna()

# Encode categorical variables as integer codes.
# Each column gets its own LabelEncoder, stored in `label_encoders`, so the
# codes can be mapped back to the original labels later with
# label_encoders[col].inverse_transform(...). Reusing one encoder for every
# column would keep only the mapping of the last column fitted.
categorical_cols = ['workclass', 'education', 'marital_status', 'occupation',
                    'relationship', 'race', 'sex', 'native_country', 'income']
label_encoders = {}
for col in categorical_cols:
    encoder = LabelEncoder()
    data[col] = encoder.fit_transform(data[col])
    label_encoders[col] = encoder
# After the loop `encoder` is the instance fitted on the last column ('income').

# Scale numeric variables to zero mean and unit variance.
# NOTE(review): the scaler is fitted on every row, i.e. before the train/test
# split performed in the modelling cells, so test rows contribute to the
# mean/std used for scaling - consider fitting on the training split only.
numeric_cols = ['age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week']
scaler = StandardScaler()
data[numeric_cols] = scaler.fit_transform(data[numeric_cols])

# Save the cleaned and preprocessed data to a new CSV file
data.to_csv('preprocessed_adult.csv', index=False)
print(data)

            age  workclass    fnlwgt  education  education_num  \
0      0.042796          5 -1.062722          9       1.128918   
1      0.880288          4 -1.007871          9       1.128918   
2     -0.033340          2  0.244693         11      -0.439738   
3      1.108695          2  0.425240          1      -1.224066   
4     -0.794697          2  1.406658          9       1.128918   
...         ...        ...       ...        ...            ...   
32556 -0.870832          2  0.638972          7       0.736754   
32557  0.118931          2 -0.335252         11      -0.439738   
32558  1.489374          2 -0.358575         11      -0.439738   
32559 -1.251511          2  0.110705         11      -0.439738   
32560  1.032559          3  0.928841         11      -0.439738   

       marital_status  occupation  relationship  race  sex  capital_gain  \
0                   4           0             1     4    1      0.146092   
1                   2           3             0     4  

In [24]:
# Random Forest on the preprocessed frame.

# Split the data: 'income' is the target, every other column is a feature.
X = data.drop(['income'], axis=1)
y = data['income']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train the model. Only fit() is timed, so the figure reported below as
# "Training time" excludes the split and the test-set prediction.
# perf_counter() is used because it is a monotonic clock meant for intervals.
rf_model = RandomForestClassifier(random_state=42)
start_time = time.perf_counter()
rf_model.fit(X_train, y_train)
end_time = time.perf_counter()
training_time = end_time - start_time

# Evaluate the model on the held-out 30% of the rows.
rf_pred = rf_model.predict(X_test)

print('Random Forest:')
print('Accuracy:', accuracy_score(y_test, rf_pred))
print('Precision:', precision_score(y_test, rf_pred))
print('Recall:', recall_score(y_test, rf_pred))
print('F1 Score:', f1_score(y_test, rf_pred))
print("Training time: %.2f seconds" % training_time)


Random Forest:
Accuracy: 0.85545364128633
Precision: 0.7520703933747412
Recall: 0.6367221735319895
F1 Score: 0.6896060749881348
Training time: 1.31 seconds


In [26]:
# Logistic Regression on the preprocessed frame.

# Split the data: 'income' is the target, every other column is a feature.
# Same test_size and random_state as the other model cell, so the split matches.
X = data.drop(['income'], axis=1)
y = data['income']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train the model. Only fit() is timed, so the figure reported below as
# "Training time" excludes the split and the test-set prediction.
# perf_counter() is used because it is a monotonic clock meant for intervals.
# NOTE(review): solver='sag' runs with max_iter left at its default here and
# the label-encoded columns are not scaled - check for a ConvergenceWarning.
lr_model = LogisticRegression(solver='sag', random_state=42)
start_time = time.perf_counter()
lr_model.fit(X_train, y_train)
end_time = time.perf_counter()
training_time = end_time - start_time

# Evaluate the model on the held-out 30% of the rows.
lr_pred = lr_model.predict(X_test)

print('Logistic Regression:')
print('Accuracy:', accuracy_score(y_test, lr_pred))
print('Precision:', precision_score(y_test, lr_pred))
print('Recall:', recall_score(y_test, lr_pred))
print('F1 Score:', f1_score(y_test, lr_pred))
print('Training time: %.2f seconds' % training_time)

Logistic Regression:
Accuracy: 0.8188750138136811
Precision: 0.7237299930410578
Recall: 0.45574057843996496
F1 Score: 0.5592901317558483
Training time: 0.16 seconds
