# Importing libraries and reading data

In [11]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
# import matplotlib.pyplot as plt
import seaborn as sns
# Apply the 'ggplot' style sheet to every matplotlib figure drawn below.
plt.style.use('ggplot')
# Show up to 200 columns when displaying wide frames. The fully qualified
# option key is required: the bare pattern 'max_columns' also matches
# 'styler.render.max_columns' on pandas >= 1.4 and raises OptionError.
pd.set_option('display.max_columns', 200)

In [13]:
# Load Train.csv from the Kaggle input directory into `df`.
df = pd.read_csv('/kaggle/input/kagglex-bipoc-2022-2023-ml-foundation/Train.csv')

## Step 1: Data Understanding
- Dataframe `shape`
- `head` and `tail`
- `dtypes`
- `describe`

1. make the column names understandable by renaming them and updating the dataframe
2. learn which columns to keep and remove
3. convert object columns to categorical or another suitable dtype

In [28]:
# (rows, columns) of the training frame.
df.shape

(157509, 42)

In [None]:
# Preview the first rows to see what each column holds.
df.head()

In [None]:
# Preview the last rows of the frame.
df.tail()

In [None]:
# Column dtypes as inferred by read_csv (no dtypes were passed when loading).
df.dtypes

In [None]:
# Summary statistics; with default arguments this covers the numeric columns.
df.describe()

In [None]:
# Per-column non-null counts and dtypes, plus overall memory usage.
df.info()

## Renaming columns

In [None]:
# Column labels before the rename in the next cell.
df.columns

In [49]:
# Replace the dataset's terse column codes with descriptive names.
# 'ID', 'YEAR' and 'TARGET' are not in the mapping and keep their original labels.
# The result is assigned back instead of using inplace=True, so the cell reads
# as a plain transformation of `df`.
# NOTE: the 'divdends from stocks' spelling is kept as-is because later cells
# select the column by exactly this name.
df = df.rename(columns={
    'AAGE': 'age',
    'ACLSWKR': 'class of worker',
    'ADTIND': 'industry code',
    'ADTOCC': 'occupation code',
    'AHGA': 'education',
    'AHRSPAY': 'wage per hour',
    'AHSCOL': 'enrolled in edu inst last wk',
    'AMARITL': 'marital status',
    'AMJIND': 'major industry code',
    'AMJOCC': 'major occupation code',
    'ARACE': 'race',
    'AREORGN': 'hispanic origin',
    'ASEX': 'gender',
    'AUNMEM': 'member of a labor union',
    'AUNTYPE': 'reason for unemployment',
    'AWKSTAT': 'full or part time employment stat',
    'CAPGAIN': 'capital gains',
    'CAPLOSS': 'capital losses',
    'DIVVAL': 'divdends from stocks',
    'FILESTAT': 'tax filer status',
    'GRINREG': 'region of previous residence',
    'GRINST': 'state of previous residence',
    'HHDFMX': 'detailed household and family stat',
    'HHDREL': 'detailed household summary in household',
    'MIGMTR1': 'migration code-change in msa',
    'MIGMTR3': 'migration code-change in reg',
    'MIGMTR4': 'migration code-move within reg',
    'MIGSAME': 'live in this house 1 year ago',
    'MIGSUN': 'migration prev res in sunbelt',
    'NOEMP': 'num persons worked for employer',
    'PARENT': 'family members under 18',
    'PEFNTVTY': 'country of birth father',
    'PEMNTVTY': 'country of birth mother',
    'PENATVTY': 'country of birth self',
    'PRCITSHP': 'citizenship',
    'SEOTR': 'own business or self employed',
    'VETQVA': 'fill inc questionnaire for veteran\'s admin',
    'VETYN': 'veterans benefits',
    'WKSWORK': 'weeks worked in year',
})

In [None]:
# Check the column labels after renaming.
df.columns

# Step 2: Data Preparation
- Dropping irrelevant columns and rows
- Identifying duplicated columns
- Renaming Columns
- Feature Creation

## Subsetting columns to prepare for training

In [None]:
# Columns to inspect one by one: for each, print its name, its distinct
# values, and how often each value occurs, with a starred divider after
# each of the two listings.
m = [
    'ID', 'age', 'class of worker', 'industry code', 'occupation code',
    'education', 'wage per hour', 'enrolled in edu inst last wk',
    'marital status', 'major industry code', 'major occupation code',
    'race', 'hispanic origin', 'gender', 'member of a labor union',
    'reason for unemployment', 'full or part time employment stat',
    'capital gains', 'capital losses', 'divdends from stocks',
    'tax filer status', 'region of previous residence',
    'state of previous residence', 'detailed household and family stat',
    'detailed household summary in household',
    'migration code-change in msa', 'migration code-change in reg',
    'migration code-move within reg', 'live in this house 1 year ago',
    'migration prev res in sunbelt', 'num persons worked for employer',
    'family members under 18', 'country of birth father',
    'country of birth mother', 'country of birth self', 'citizenship',
    'own business or self employed',
    'fill inc questionnaire for veteran\'s admin', 'veterans benefits',
    'weeks worked in year', 'YEAR', 'TARGET',
]

# Built once; printed after the unique values and again after the counts.
divider = '\n' + '*' * 20 + '\n'

for column_name in m:
    column = df[column_name]
    print(f'{column_name}\n')
    print(column.unique())
    print(divider)
    print(column.value_counts())
    print(divider)

In [None]:
# Explicit, ordered list of the columns carried forward for training.
# Remove names from this list to drop features; .copy() makes the result an
# independent frame rather than a selection of the previous one.
columns_to_keep = [
    'ID',
    'age',
    'class of worker',
    'industry code',
    'occupation code',
    'education',
    'wage per hour',
    'enrolled in edu inst last wk',
    'marital status',
    'major industry code',
    'major occupation code',
    'race',
    'hispanic origin',
    'gender',
    'member of a labor union',
    'reason for unemployment',
    'full or part time employment stat',
    'capital gains',
    'capital losses',
    'divdends from stocks',
    'tax filer status',
    'region of previous residence',
    'state of previous residence',
    'detailed household and family stat',
    'detailed household summary in household',
    'migration code-change in msa',
    'migration code-change in reg',
    'migration code-move within reg',
    'live in this house 1 year ago',
    'migration prev res in sunbelt',
    'num persons worked for employer',
    'family members under 18',
    'country of birth father',
    'country of birth mother',
    'country of birth self',
    'citizenship',
    'own business or self employed',
    'fill inc questionnaire for veteran\'s admin',
    'veterans benefits',
    'weeks worked in year',
    'YEAR',
    'TARGET',
]
df = df[columns_to_keep].copy()

In [69]:
# Example of dropping columns
# df.drop(['YEAR'], axis=1)

In [None]:
# Count missing values in each column.
df.isna().sum()

In [83]:
# Distinct TARGET labels, ordered by frequency (most common first).
df.TARGET.value_counts().index

Int64Index([0, 1], dtype='int64')

In [86]:
# Row count for each TARGET label, in descending order of frequency.
df.TARGET.value_counts().values

array([144529,  12980])

In [84]:
# Class-imbalance ratio: rows with TARGET == 1 per row with TARGET == 0.
# The counts are computed once and looked up by label, so the result does not
# depend on the frequency ordering that value_counts() applies.
target_counts = df['TARGET'].value_counts()
target_counts.loc[1] / target_counts.loc[0]

0.08980896567470888

# Step 3: Feature Understanding
(Univariate analysis)

- Plotting Feature Distributions
    - Histogram
    - KDE
    - Boxplot

# Step 4: Feature Relationships
- Scatterplot
- Heatmap Correlation
- Pairplot
- Groupby comparisons

# Scratch code

In [87]:
# Load Test.csv from the same Kaggle input directory as Train.csv.
test = pd.read_csv('/kaggle/input/kagglex-bipoc-2022-2023-ml-foundation/Test.csv')

In [88]:
# Preview the first rows of the test set.
test.head()

Unnamed: 0,ID,AAGE,ACLSWKR,ADTIND,ADTOCC,AHGA,AHRSPAY,AHSCOL,AMARITL,AMJIND,AMJOCC,ARACE,AREORGN,ASEX,AUNMEM,AUNTYPE,AWKSTAT,CAPGAIN,CAPLOSS,DIVVAL,FILESTAT,GRINREG,GRINST,HHDFMX,HHDREL,MIGMTR1,MIGMTR3,MIGMTR4,MIGSAME,MIGSUN,NOEMP,PARENT,PEFNTVTY,PEMNTVTY,PENATVTY,PRCITSHP,SEOTR,VETQVA,VETYN,WKSWORK,YEAR
0,ai1kagv30p8v,42,State government,41,3,Some college but no degree,0,Not in universe,Married-civilian spouse present,Hospital services,Executive admin and managerial,White,All other,Female,Not in universe,Not in universe,Children or Armed Forces,0,0,0,Joint both under 65,Not in universe,Not in universe,Spouse of householder,Spouse of householder,Nonmover,Nonmover,Nonmover,Yes,Not in universe,4,Not in universe,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,2,52,1994
1,9s9e3x6a8f7u,45,Local government,43,26,Some college but no degree,0,Not in universe,Divorced,Education,Adm support including clerical,Black,All other,Female,No,Not in universe,Children or Armed Forces,0,0,0,Head of household,South,Alabama,Householder,Householder,NonMSA to nonMSA,Same county,Same county,No,Yes,5,Not in universe,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,2,38,1994
2,qlvd7mszxd2z,47,Private,12,37,5th or 6th grade,0,Not in universe,Married-civilian spouse present,Manufacturing-durable goods,Machine operators assmblrs & inspctrs,White,Puerto Rican,Female,Not in universe,Not in universe,Full-time schedules,0,0,0,Joint both under 65,Not in universe,Not in universe,Householder,Householder,,,,Not in universe under 1 year old,,3,Not in universe,Puerto-Rico,Puerto-Rico,Puerto-Rico,Native- Born in Puerto Rico or U S Outlying,0,Not in universe,2,44,1995
3,uwhbqcnx5a5z,62,Not in universe,0,0,7th and 8th grade,0,Not in universe,Married-civilian spouse present,Not in universe or children,Not in universe,White,All other,Female,Not in universe,Not in universe,Children or Armed Forces,0,0,3000,Joint both under 65,Not in universe,Not in universe,Spouse of householder,Spouse of householder,Nonmover,Nonmover,Nonmover,Yes,Not in universe,0,Not in universe,Italy,Italy,Italy,Foreign born- U S citizen by naturalization,0,Not in universe,2,0,1994
4,27c5sqbrzdwf,63,Private,45,2,Masters degree(MA MS MEng MEd MSW MBA),0,Not in universe,Married-civilian spouse present,Other professional services,Executive admin and managerial,White,All other,Male,Not in universe,Not in universe,Children or Armed Forces,0,0,300,Joint both under 65,Not in universe,Not in universe,Spouse of householder,Spouse of householder,Nonmover,Nonmover,Nonmover,Yes,Not in universe,2,Not in universe,Greece,United-States,United-States,Native- Born in the United States,0,Not in universe,2,52,1994


# Write `output` to ./solution.csv without the index column.
# NOTE(review): `output` is not defined in any visible cell — confirm it is
# created before this line runs, otherwise this raises NameError on a fresh kernel.
output.to_csv('./solution.csv', index=False)