In [1]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('learningSet.csv')

In [3]:
df.shape

(95412, 481)

In [4]:
df.head(3)

Unnamed: 0,ODATEDW,OSOURCE,TCODE,STATE,ZIP,MAILCODE,PVASTATE,DOB,NOEXCH,RECINHSE,...,TARGET_D,HPHONE_D,RFA_2R,RFA_2F,RFA_2A,MDMAUD_R,MDMAUD_F,MDMAUD_A,CLUSTER2,GEOCODE2
0,8901,GRI,0,IL,61081,,,3712,0,,...,0.0,0,L,4,E,X,X,X,39.0,C
1,9401,BOA,1,CA,91326,,,5202,0,,...,0.0,0,L,2,G,X,X,X,1.0,A
2,9001,AMH,1,NC,27017,,,0,0,,...,0.0,1,L,4,E,X,X,X,60.0,C


# Data Cleaning

- create a list to collect features that we want want to drop

In [5]:
drop_list = []

## Null Values Check

In [6]:
null_check = df.isna().sum() / df.shape[0] * 100

In [7]:
print('features with null values: ', len(null_check[null_check > 0]))

features with null values:  92


In [8]:
print('Features with null values > 80%: ', len(null_check[null_check > 80.]))
null_check[null_check > 80.]

Features with null values > 80%:  33


NUMCHLD     87.018404
RDATE_3     99.746363
RDATE_4     99.705488
RDATE_5     99.990567
RDATE_6     99.186685
RDATE_7     90.677273
RDATE_9     82.461326
RDATE_10    89.035970
RDATE_11    84.551209
RDATE_13    87.160944
RDATE_15    92.388798
RDATE_17    90.146942
RDATE_19    83.359535
RDATE_20    91.732696
RDATE_21    90.029556
RDATE_23    91.763091
RDATE_24    81.409047
RAMNT_3     99.746363
RAMNT_4     99.705488
RAMNT_5     99.990567
RAMNT_6     99.186685
RAMNT_7     90.677273
RAMNT_9     82.461326
RAMNT_10    89.035970
RAMNT_11    84.551209
RAMNT_13    87.160944
RAMNT_15    92.388798
RAMNT_17    90.146942
RAMNT_19    83.359535
RAMNT_20    91.732696
RAMNT_21    90.029556
RAMNT_23    91.763091
RAMNT_24    81.409047
dtype: float64

- add the features with more than 80% null values to drop list

In [9]:
drop_list.extend(null_check[null_check > 80.].index)

## Check selected categorical features

### OSOURCE and ZIP

In [10]:
df['OSOURCE'].head(3)

0    GRI
1    BOA
2    AMH
Name: OSOURCE, dtype: object

In [11]:
print('Amount of unique values in OSOURCE: ', len(df['OSOURCE'].unique()))

Amount of unique values in OSOURCE:  896


### Enrich ZIP data with longitude and latitude 
- could be used later to make sure that deographical data is included in model

In [12]:
df_geo = pd.read_excel('US_zip_lon_lat.xlsx')

In [13]:
df_geo.shape

(41470, 14)

In [14]:
df_geo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41470 entries, 0 to 41469
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   country code            41470 non-null  object 
 1   postal code             41470 non-null  int64  
 2   place                   41470 non-null  object 
 3   state                   40957 non-null  object 
 4   statecode               40959 non-null  object 
 5   province_or_county      40959 non-null  object 
 6   province_or_countycode  40959 non-null  float64
 7   community               1 non-null      object 
 8   communitycode           1 non-null      float64
 9   latitude                41470 non-null  object 
 10  longitude               41470 non-null  object 
 11  accuracy                40938 non-null  float64
 12  Country                 41470 non-null  object 
 13  Continent               41470 non-null  object 
dtypes: float64(3), int64(1), object(10)
me

In [15]:
df['ZIP'].head(3)

0    61081
1    91326
2    27017
Name: ZIP, dtype: object

In [16]:
print('Amount of unique values in ZIP: ', len(df['OSOURCE'].unique()))

Amount of unique values in ZIP:  896


In [17]:
df_zip = pd.DataFrame(df['ZIP'])
df_zip['lon'] = 0
df_zip['lat'] = 0

In [18]:
for i, row in df_zip.iterrows():
    zip_clean = row['ZIP'].replace('-', '')
    geo_data = df_geo[df_geo['postal code'] == int(zip_clean)]
    if len(geo_data):
        df_zip.at[i,'lon'] = list(geo_data['longitude'])[0]
        df_zip.at[i,'lat'] = list(geo_data['latitude'])[0]
    else: 
        df_zip.at[i,'lon'] = np.nan
        df_zip.at[i,'lat'] = np.nan

In [19]:
df_zip.isna().sum()

ZIP      0
lon    301
lat    301
dtype: int64

- add OSOURCE and ZIP to drop list

In [20]:
drop_list.extend(['OSOURCE', 'ZIP'])

### GENDER

In [21]:
df['GENDER'].head(3)

0    F
1    M
2    M
Name: GENDER, dtype: object

In [22]:
print('Amount of unique values in GENDER: ', len(df['GENDER'].unique()))
print(df['GENDER'].unique())

Amount of unique values in GENDER:  7
['F' 'M' ' ' 'C' 'U' 'J' 'A']


In [23]:
df['GENDER'].value_counts()

F    51277
M    39094
      2957
U     1715
J      365
C        2
A        2
Name: GENDER, dtype: int64

- In GENDER we see that the most frequent category is F
- we will replace the empty / non values with this 

In [24]:
df['GENDER'].replace(r'^\s+$',  df['GENDER'].mode()[0], regex=True, inplace=True)

In [25]:
df['GENDER'] = ['other' if g != 'F' and g != 'M' else g for g in df['GENDER'] ];

In [26]:
df['GENDER'].value_counts()

F        54234
M        39094
other     2084
Name: GENDER, dtype: int64