In [1]:
import pandas as pd
df = pd.read_csv('adult_with_headers.csv')

In [2]:
df.describe()

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
count,17923.0,17923.0,17923.0,17922.0,17922.0,17922.0
mean,38.550243,190185.4,10.090219,1050.742272,87.770282,40.414407
std,13.623031,105443.4,2.559689,7281.141976,402.532485,12.274417
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,118694.5,9.0,0.0,0.0,40.0
50%,37.0,178915.0,10.0,0.0,0.0,40.0
75%,47.5,237849.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [3]:
df.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              1
sex               1
capital_gain      1
capital_loss      1
hours_per_week    1
native_country    1
income            1
dtype: int64

In [4]:
df.dtypes

age                 int64
workclass          object
fnlwgt              int64
education          object
education_num       int64
marital_status     object
occupation         object
relationship       object
race               object
sex                object
capital_gain      float64
capital_loss      float64
hours_per_week    float64
native_country     object
income             object
dtype: object

In [5]:
df = df.dropna()

In [6]:
numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns
categorical_cols = df.select_dtypes(include=['object']).columns

# Create separate dataframes
df_numerical = df[numerical_cols]
df_categorical = df[categorical_cols]

In [7]:
from sklearn.preprocessing import StandardScaler

# Initialize the scaler
scaler = StandardScaler()

# Fit and transform the numerical data
df_numerical_scaled = scaler.fit_transform(df_numerical)

# Convert the scaled data back to a DataFrame
df_numerical_scaled = pd.DataFrame(df_numerical_scaled, columns=numerical_cols)

In [8]:
df_processed = pd.concat([df_numerical_scaled, df_categorical.reset_index(drop=True)], axis=1)

# Display the first few rows of the processed dataset
print(df_processed.head())

        age    fnlwgt  education_num  capital_gain  capital_loss  \
0  0.032932 -1.068581       1.136750      0.154274     -0.218051   
1  0.840440 -1.013622       1.136750     -0.144314     -0.218051   
2 -0.040477  0.241427      -0.426003     -0.144314     -0.218051   
3  1.060669  0.422333      -1.207380     -0.144314     -0.218051   
4 -0.774575  1.405697       1.136750     -0.144314     -0.218051   

   hours_per_week          workclass   education       marital_status  \
0       -0.033763          State-gov   Bachelors        Never-married   
1       -2.233521   Self-emp-not-inc   Bachelors   Married-civ-spouse   
2       -0.033763            Private     HS-grad             Divorced   
3       -0.033763            Private        11th   Married-civ-spouse   
4       -0.033763            Private   Bachelors   Married-civ-spouse   

           occupation    relationship    race      sex  native_country  income  
0        Adm-clerical   Not-in-family   White     Male   United-States 

Standard Scaling:

Suitable for algorithms that assume normality in the data, such as Linear Regression, Logistic Regression, and Linear Discriminant Analysis.
Useful when features have different variances, as it normalizes the variance across features.
Retains outliers, which can be crucial for certain models that need to capture these variations.

Min-Max Scaling:

Preferred for algorithms that do not assume any distribution, such as Neural Networks, K-Nearest Neighbors, and Principal Component Analysis.
When features are on different scales, Min-Max Scaling brings all features within the same range, making the convergence faster in Gradient Descent-based algorithms.
Effective when dealing with features that have outliers, as it compresses all values within a specific range.

By following these steps, we can ensure that our dataset is clean and well-prepared for further analysis or modeling. The choice of scaling technique depends on the nature of the data and the specific requirements of the machine learning algorithms being used.

In [9]:
# Identify categorical columns with less than 5 unique categories
categorical_cols_to_encode = [col for col in df_categorical.columns if df_categorical[col].nunique() < 5]

# Apply One-Hot Encoding to these columns
df_categorical_encoded = pd.get_dummies(df_categorical, columns=categorical_cols_to_encode, drop_first=True)

# Columns that are not encoded
categorical_cols_not_encoded = [col for col in df_categorical.columns if col not in categorical_cols_to_encode]

# Combine with the columns that were not encoded
df_categorical_final = pd.concat([df_categorical_encoded, df_categorical[categorical_cols_not_encoded]], axis=1)

In [10]:
# Concatenate numerical and categorical dataframes
df_processed = pd.concat([df_numerical_scaled, df_categorical_final.reset_index(drop=True)], axis=1)

# Display the first few rows of the processed dataset
print(df_processed.head())

        age    fnlwgt  education_num  capital_gain  capital_loss  \
0  0.032932 -1.068581       1.136750      0.154274     -0.218051   
1  0.840440 -1.013622       1.136750     -0.144314     -0.218051   
2 -0.040477  0.241427      -0.426003     -0.144314     -0.218051   
3  1.060669  0.422333      -1.207380     -0.144314     -0.218051   
4 -0.774575  1.405697       1.136750     -0.144314     -0.218051   

   hours_per_week          workclass   education       marital_status  \
0       -0.033763          State-gov   Bachelors        Never-married   
1       -2.233521   Self-emp-not-inc   Bachelors   Married-civ-spouse   
2       -0.033763            Private     HS-grad             Divorced   
3       -0.033763            Private        11th   Married-civ-spouse   
4       -0.033763            Private   Bachelors   Married-civ-spouse   

           occupation  ...  native_country sex_ Male income_ >50K  \
0        Adm-clerical  ...   United-States      True        False   
1     Exec-man

In [11]:
from sklearn.preprocessing import LabelEncoder

# Initialize the LabelEncoder
label_encoder = LabelEncoder()

# Apply Label Encoding to categorical columns with more than 5 categories
for col in df_categorical.columns:
    if df_categorical[col].nunique() > 5:
        df_categorical[col] = label_encoder.fit_transform(df_categorical[col])

# Display the first few rows of the encoded categorical data
print(df_categorical.head())

   workclass  education  marital_status  occupation  relationship    race  \
0          7          9               4           1             1   White   
1          6          9               2           4             0   White   
2          4         11               0           6             1   White   
3          4          1               2           6             0   Black   
4          4          9               2          10             5   Black   

       sex  native_country  income  
0     Male              38   <=50K  
1     Male              38   <=50K  
2     Male              38   <=50K  
3     Male              38   <=50K  
4   Female               5   <=50K  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_categorical[col] = label_encoder.fit_transform(df_categorical[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_categorical[col] = label_encoder.fit_transform(df_categorical[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_categorical[col] = label_encoder.fit_transform(df_categorica

**Pros and Cons of One-Hot Encoding and Label Encoding**

**One-Hot Encoding**

**Pros:**

No Ordinality: Does not assume any ordinal relationship between categories.
Compatibility: Works well with many machine learning algorithms that expect categorical data in a numerical format.
Interpretability: Each category is represented explicitly, making the data easier to interpret.

**Cons:**

High Dimensionality: Can result in a large number of features, especially with high cardinality, which can lead to increased computational cost and potential overfitting.
Sparse Data: Creates sparse matrices, which can be inefficient in terms of memory usage.

**Label Encoding**

**Pros:**

Simplicity: Simple and fast to implement.
Memory Efficient: Uses less memory compared to One-Hot Encoding, especially with high cardinality.

**Cons:**

Ordinal Assumption: Introduces an ordinal relationship between categories, which can be misleading for algorithms that interpret numerical values as having inherent order.
Less Interpretability: Encoded values may not be as interpretable as one-hot encoded values, especially when categories do not have a natural order.


In [12]:
import pandas as pd

# Load the dataset
df = pd.read_csv('adult_with_headers.csv')

# Create interaction term between education-num and hours-per-week
df['education_hours_interaction'] = df['education_num'] * df['hours_per_week']

# Create capital gain to loss ratio
df['capital_gain_loss_ratio'] = df['capital_gain'] / (df['capital_loss'] + 1)  # Adding 1 to avoid division by zero

# Display the first few rows of the dataset to see the new features
print(df.head())

   age          workclass  fnlwgt   education  education_num  \
0   39          State-gov   77516   Bachelors             13   
1   50   Self-emp-not-inc   83311   Bachelors             13   
2   38            Private  215646     HS-grad              9   
3   53            Private  234721        11th              7   
4   28            Private  338409   Bachelors             13   

        marital_status          occupation    relationship    race      sex  \
0        Never-married        Adm-clerical   Not-in-family   White     Male   
1   Married-civ-spouse     Exec-managerial         Husband   White     Male   
2             Divorced   Handlers-cleaners   Not-in-family   White     Male   
3   Married-civ-spouse   Handlers-cleaners         Husband   Black     Male   
4   Married-civ-spouse      Prof-specialty            Wife   Black   Female   

   capital_gain  capital_loss  hours_per_week  native_country  income  \
0        2174.0           0.0            40.0   United-States   <=5

**Explanation of New Features**

**Education and Hours-Per-Week Interaction:**

Rationale: This feature captures the combined effect of education level and working hours, which could be an important predictor of income. Individuals with higher education who work more hours are likely to have higher earnings, while the effect might be different for those with lower education.

**Capital Gain to Loss Ratio:**

Rationale: This ratio helps to understand the net effect of an individual's investments. It normalizes the gains by the losses, providing a clearer picture of financial health. A higher ratio indicates a better financial situation, which could be important for predicting income or other economic outcomes.
By creating these features, we enrich the dataset with more meaningful information that can help machine learning models make better predictions.








In [13]:
# Calculate skewness of numerical features
numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns
skewness = df[numerical_cols].skew()

# Display skewness
print(skewness)

age                             0.572101
fnlwgt                          1.466295
education_num                  -0.304767
capital_gain                   12.128971
capital_loss                    4.564873
hours_per_week                  0.202396
education_hours_interaction     0.708895
capital_gain_loss_ratio        12.128971
dtype: float64


In [17]:
import pandas as pd

# Load the dataset
df = pd.read_csv('adult_with_headers.csv')

# Display the first few rows of the dataset
print(df.head())

   age          workclass  fnlwgt   education  education_num  \
0   39          State-gov   77516   Bachelors             13   
1   50   Self-emp-not-inc   83311   Bachelors             13   
2   38            Private  215646     HS-grad              9   
3   53            Private  234721        11th              7   
4   28            Private  338409   Bachelors             13   

        marital_status          occupation    relationship    race      sex  \
0        Never-married        Adm-clerical   Not-in-family   White     Male   
1   Married-civ-spouse     Exec-managerial         Husband   White     Male   
2             Divorced   Handlers-cleaners   Not-in-family   White     Male   
3   Married-civ-spouse   Handlers-cleaners         Husband   Black     Male   
4   Married-civ-spouse      Prof-specialty            Wife   Black   Female   

   capital_gain  capital_loss  hours_per_week  native_country  income  
0          2174             0              40   United-States   <=50

In [18]:
from sklearn.ensemble import IsolationForest

# Select numerical features for outlier detection
numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns
df_numerical = df[numerical_cols]

# Initialize the Isolation Forest model
iso_forest = IsolationForest(contamination=0.05)  # Assuming 5% of the data is outliers

# Fit the model and predict outliers
df['outliers'] = iso_forest.fit_predict(df_numerical)

# -1 indicates outliers, 1 indicates inliers
outliers = df[df['outliers'] == -1]
inliers = df[df['outliers'] == 1]

print(f"Number of outliers detected: {len(outliers)}")
print(f"Number of inliers: {len(inliers)}")



Number of outliers detected: 1628
Number of inliers: 30933


In [19]:
# Remove outliers
df_cleaned = df[df['outliers'] == 1].drop(columns=['outliers'])

# Display the first few rows of the cleaned dataset
print(df_cleaned.head())

   age          workclass  fnlwgt   education  education_num  \
0   39          State-gov   77516   Bachelors             13   
1   50   Self-emp-not-inc   83311   Bachelors             13   
2   38            Private  215646     HS-grad              9   
3   53            Private  234721        11th              7   
4   28            Private  338409   Bachelors             13   

        marital_status          occupation    relationship    race      sex  \
0        Never-married        Adm-clerical   Not-in-family   White     Male   
1   Married-civ-spouse     Exec-managerial         Husband   White     Male   
2             Divorced   Handlers-cleaners   Not-in-family   White     Male   
3   Married-civ-spouse   Handlers-cleaners         Husband   Black     Male   
4   Married-civ-spouse      Prof-specialty            Wife   Black   Female   

   capital_gain  capital_loss  hours_per_week  native_country  income  
0          2174             0              40   United-States   <=50

By using the Isolation Forest algorithm, we can effectively identify and remove outliers from the dataset. This preprocessing step helps in improving the quality of the data, leading to better model performance, more robust predictions, and reduced computational complexity. Detecting and handling outliers is an essential part of data preprocessing to ensure that the machine learning models trained on the data are accurate and reliable.

In [2]:
pip install ppscore



In [5]:
import pandas as pd
import numpy as np
import ppscore as pps
df= pd.read_csv('adult_with_headers.csv')
pps_matrix = pps.matrix(df)



In [None]:
# Calculate Pearson correlation matrix
corr_matrix = df.corr()

# For categorical variables, use appropriate correlation measure
# Example: Cramer's V
from scipy.stats import chi2_contingency
def cramers_v(x, y):
    confusion_matrix = pd.crosstab(x, y)
    chi2 = chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum().sum()
    phi2 = chi2 / n
    r, k = confusion_matrix.shape
    phi2corr = max(0, phi2 - ((k-1)*(r-1))/(n-1))
    rcorr = r - ((r-1)**2)/(n-1)
    kcorr = k - ((k-1)**2)/(n-1)
    return np.sqrt(phi2corr / min((kcorr-1), (rcorr-1)))

# Example usage:
cramers_v_matrix = pd.DataFrame({c1: [cramers_v(df[c1], df[c2]) for c2 in df.columns] for c1 in df.columns}, index=df.columns)

Strengths of PPS: It captures predictive power which traditional correlation measures may miss, providing insights into feature importance for prediction tasks.
Limitations: PPS assumes the relationship between features and the target is monotonic, so it may miss complex relationships or interactions.
By applying both PPS and correlation matrices, you can obtain a more nuanced understanding of feature relationships in your dataset, leveraging both linear and non-linear insights for better model building and feature selection.





