# Data Importing and Data Preparation

In [182]:
# Dependencies and setup
import numpy as np
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
import csv
from datetime import date
from uszipcode import SearchEngine

In [201]:
# defining data path
path=Path('Resources/Raw_data/testData.csv')

In [202]:
# Reading data source using pandas
rent_df=pd.read_csv(path)

ParserError: Error tokenizing data. C error: Expected 7 fields in line 172, saw 8


### Getting an error with 7 columns name but some rows have 8 columns. 
### Using CSV reader to get the columns name and handaling this error so that can easily read with pandas

In [203]:
# Reading a csv file using CSV module for handaling error
with open(path, 'r') as f:
    csv_reader = csv.reader(f, delimiter=',')
    columns = next(csv_reader)
    print(columns)


['name', 'dob', 'houseID', 'houseZip', 'paymentDate', 'paymentAmount', 'rentAmount']


In [204]:
# Defining the columns name(adding new column name "extra")
names=columns+["extra"]

In [205]:
# Reading data source using pandas read_csv
rent_df=pd.read_csv(path, skiprows=1,names=names)

## Data Understanding

In [206]:
#Looking top rows of rent_df dataframe
rent_df.head()

Unnamed: 0,name,dob,houseID,houseZip,paymentDate,paymentAmount,rentAmount,extra
0,Karima Germany,05/23/1951,1192,92154,11/01/2011,1321,1321.0,
1,Agustina Spargo,01/01/1900,21,92111,20110906,2289,2289.0,
2,Lucilla Broderick,01/01/1900,1474,92159,20111101,1439,1439.0,
3,Russ Mchale,04/20/1977,2015,92137,20120701,1744,1744.0,
4,Carmelita Ritzer,03/09/1969,311,92136,20110201,1471,1471.0,


In [207]:
# Columns of rent_df dataframe
rent_df.columns

Index(['name', 'dob', 'houseID', 'houseZip', 'paymentDate', 'paymentAmount',
       'rentAmount', 'extra'],
      dtype='object')

In [208]:
# Shape of rent_df datafrmae
shape=rent_df.shape
print(f'The number of rows are: {shape[0]}, number of columns are: {shape[1]}')

The number of rows are: 59813, number of columns are: 8


In [209]:
#Information of the dataframe using .info()
rent_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59813 entries, 0 to 59812
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   name           56903 non-null  object 
 1   dob            59813 non-null  object 
 2   houseID        59813 non-null  object 
 3   houseZip       59813 non-null  int64  
 4   paymentDate    59813 non-null  object 
 5   paymentAmount  59281 non-null  object 
 6   rentAmount     59775 non-null  float64
 7   extra          2910 non-null   float64
dtypes: float64(2), int64(1), object(5)
memory usage: 3.7+ MB


In [210]:
#looking for nan value
rent_df.isna().sum()

name              2910
dob                  0
houseID              0
houseZip             0
paymentDate          0
paymentAmount      532
rentAmount          38
extra            56903
dtype: int64

In [211]:
#filling NAN with 0 in extra column
rent_df['extra']=rent_df['extra'].fillna(0)
rent_df

Unnamed: 0,name,dob,houseID,houseZip,paymentDate,paymentAmount,rentAmount,extra
0,Karima Germany,05/23/1951,1192,92154,11/01/2011,1321,1321.0,0.0
1,Agustina Spargo,01/01/1900,21,92111,20110906,2289,2289.0,0.0
2,Lucilla Broderick,01/01/1900,1474,92159,20111101,1439,1439.0,0.0
3,Russ Mchale,04/20/1977,2015,92137,20120701,1744,1744.0,0.0
4,Carmelita Ritzer,03/09/1969,311,92136,20110201,1471,1471.0,0.0
...,...,...,...,...,...,...,...,...
59808,Jennell Buchholtz,05/21/1983,1095,92196,20101001,1744,1744.0,0.0
59809,Berneice Power,08/24/1984,196,92123,20100401,1756,1756.0,0.0
59810,Micki Belvin,01/01/1900,1692,92166,20120701,1896,1896.0,0.0
59811,Emilee Samford,01/01/1900,686,92191,20120528,1668,1451.0,0.0


## Correcting Columns Shift

## Since some rows are shifted one column up, correcting this error using shift method.

In [212]:
#Get all rows by mask
mask = rent_df['extra'] != 0.0
#All columns name
c = list(rent_df.columns)
#Shift columns, by converting to strings
rent_df.loc[mask, c] = rent_df.loc[mask, c].astype(str).shift(-1, axis=1)

In [213]:
#Dropping the extra column after shifting rows correctly
rent_df.drop(columns="extra", axis=1, inplace=True)

In [214]:
#Looking top rows of  corrected rent_df dataframe
rent_df.head()

Unnamed: 0,name,dob,houseID,houseZip,paymentDate,paymentAmount,rentAmount
0,Karima Germany,05/23/1951,1192,92154,11/01/2011,1321,1321.0
1,Agustina Spargo,01/01/1900,21,92111,20110906,2289,2289.0
2,Lucilla Broderick,01/01/1900,1474,92159,20111101,1439,1439.0
3,Russ Mchale,04/20/1977,2015,92137,20120701,1744,1744.0
4,Carmelita Ritzer,03/09/1969,311,92136,20110201,1471,1471.0


## Converting Columns to Appropriate Datatypes

In [215]:
#Converting column to appropriate datatypes
rent_df['dob']=pd.to_datetime(rent_df['dob']).dt.date
rent_df['paymentDate']=pd.to_datetime(rent_df['paymentDate']).dt.date
rent_df['houseID']=rent_df['houseID'].astype('int')
rent_df['paymentAmount']=rent_df['paymentAmount'].astype('float')
rent_df['rentAmount']=rent_df['rentAmount'].astype('float')

## Adding Age Columns Using Date of Birth Column

In [216]:
#Adding the age columns using date of birth
rent_df["age"] = rent_df["dob"].apply(lambda x : (date.today().year - x.year))

In [217]:
# Shape of rent_df datafrmae
shape=rent_df.shape
print(f'The number of rows are: {shape[0]}, number of columns are: {shape[1]}')

The number of rows are: 59813, number of columns are: 8


In [218]:
# looking for statistical terms(transpose for better visulization)
rent_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
houseID,59813.0,1236.777105,715.827165,1.0,610.0,1240.0,1860.0,2475.0
paymentAmount,59243.0,1511.771461,360.332519,-348.0,1318.0,1522.0,1739.0,2861.0
rentAmount,59813.0,1505.65133,303.685935,428.0,1310.0,1506.0,1720.0,2647.0
age,59813.0,82.674268,40.686114,-8.0,43.0,85.0,122.0,122.0


# Taking Care of Missing Data

In [219]:
#looking for nan value
rent_df.isna().sum()

name               0
dob                0
houseID            0
houseZip           0
paymentDate        0
paymentAmount    570
rentAmount         0
age                0
dtype: int64

In [220]:
#Info of the corrected dataframe
rent_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59813 entries, 0 to 59812
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   name           59813 non-null  object 
 1   dob            59813 non-null  object 
 2   houseID        59813 non-null  int64  
 3   houseZip       59813 non-null  object 
 4   paymentDate    59813 non-null  object 
 5   paymentAmount  59243 non-null  float64
 6   rentAmount     59813 non-null  float64
 7   age            59813 non-null  int64  
dtypes: float64(2), int64(2), object(4)
memory usage: 3.7+ MB


In [221]:
#filling NAN with 0 in paymentAmount column
rent_df['paymentAmount']=rent_df['paymentAmount'].fillna(0)

### Missing date of birth is with 1900-01-01. Need to remove missing date of birth for visulization or model building.

In [222]:
# Looking for rows with date of birth 1900-01-01
rent_df.loc[rent_df["dob"].apply(lambda x : x.year)==1900, ['dob', 'age']]

Unnamed: 0,dob,age
1,1900-01-01,122
2,1900-01-01,122
6,1900-01-01,122
7,1900-01-01,122
12,1900-01-01,122
...,...,...
59800,1900-01-01,122
59802,1900-01-01,122
59806,1900-01-01,122
59810,1900-01-01,122


In [230]:
# Age above 1 and below 110
rent_df.loc[(rent_df['age']<1) | (rent_df['age']>110), ['name', 'dob','age']]


Unnamed: 0,name,dob,age
1,Agustina Spargo,1900-01-01,122
2,Lucilla Broderick,1900-01-01,122
6,Theda Howard,1900-01-01,122
7,Delmar Facey,1900-01-01,122
12,Santo Hanney,1900-01-01,122
...,...,...,...
59800,Leola Derrickson,1900-01-01,122
59802,Cherise Chaney,1900-01-01,122
59806,Millard Woodford,1900-01-01,122
59810,Micki Belvin,1900-01-01,122


#### 29933 rows are missing date of birth
#### Will remove the missing date from date of birth column later while visulization and model building.

## Saving the Cleaned Data for Visualization and Model Building

In [40]:
# Cleaned dataframe
rent_df.head(20)

Unnamed: 0,name,dob,houseID,houseZip,paymentDate,paymentAmount,rentAmount,age
0,Karima Germany,1951-05-23,1192,92154,2011-11-01,1321.0,1321.0,71
1,Agustina Spargo,1900-01-01,21,92111,2011-09-06,2289.0,2289.0,122
2,Lucilla Broderick,1900-01-01,1474,92159,2011-11-01,1439.0,1439.0,122
3,Russ Mchale,1977-04-20,2015,92137,2012-07-01,1744.0,1744.0,45
4,Carmelita Ritzer,1969-03-09,311,92136,2011-02-01,1471.0,1471.0,53
5,Clifton Ellwood,1993-11-02,430,92103,2011-11-01,1233.0,1233.0,29
6,Theda Howard,1900-01-01,2260,92161,2011-09-01,1850.0,1850.0,122
7,Delmar Facey,1900-01-01,541,92161,2011-12-01,1587.0,1587.0,122
8,Lashawn Rotella,1985-12-05,1336,92190,2011-07-01,1930.0,1930.0,37
9,Tianna Greenwell,1983-01-14,2273,92190,2010-05-01,1597.0,1597.0,39


In [52]:
#saving the cleaned data 
rent_df.to_csv('Resources/Cleaned_data/cleaned_rent_data.csv', index=False)

In [181]:
# Saving as html for website table
rent_df.to_html("Resources/Cleaned_data/rent_data.html")

# Feature Selection and Feature Engineering
### In this section, I added more columns using the house zipcode provided. Used SearchEngine module to add following columns:
1. Lattitude
2. Longitude
3. City
4. State
5. County
6. Median Income
7. Population
8. Housing Units 
9. Occupied Housing Units

In [231]:
# Making copy of the rent_df dataframe
model_df=rent_df.copy()

In [232]:
model_df.head()

Unnamed: 0,name,dob,houseID,houseZip,paymentDate,paymentAmount,rentAmount,age
0,Karima Germany,1951-05-23,1192,92154,2011-11-01,1321.0,1321.0,71
1,Agustina Spargo,1900-01-01,21,92111,2011-09-06,2289.0,2289.0,122
2,Lucilla Broderick,1900-01-01,1474,92159,2011-11-01,1439.0,1439.0,122
3,Russ Mchale,1977-04-20,2015,92137,2012-07-01,1744.0,1744.0,45
4,Carmelita Ritzer,1969-03-09,311,92136,2011-02-01,1471.0,1471.0,53


In [235]:
# Using SearchEngine module to get the information using the zipcode
search = SearchEngine()
result = search.by_zipcode('92159')
result

SimpleZipcode(zipcode='92159', zipcode_type='PO Box', major_city='San Diego', post_office_city=None, common_city_list=['San Diego'], county='San Diego County', state='CA', lat=None, lng=None, timezone=None, radius_in_miles=None, area_code_list=['619'], population=None, population_density=None, land_area_in_sqmi=None, water_area_in_sqmi=None, housing_units=None, occupied_housing_units=None, median_home_value=None, median_household_income=None, bounds_west=None, bounds_east=None, bounds_north=None, bounds_south=None)

## Adding New Columns to the Data

In [61]:
#Adding city, county, lat, lng, housing units columns  to dataframe using SearchEngine module
model_df["city"]=''
model_df["county"]=''
model_df['lat']=''
model_df['lng']=''
model_df['housingUnits']=''
model_df['occupiedHousingUnits']=''
model_df['medianIncome']=''
model_df['population']=''
model_df['state']=''
for index, row in rent_df.iterrows():

    # Get zipcode from dataframe
    zipcode = str(row['houseZip'])
    #Searching by zipcode
    search = SearchEngine()
    result = search.by_zipcode(zipcode)
    #Adding columns
    try:
        model_df.loc[index, 'city'] = result.city
        model_df.loc[index, 'county'] = result.county
        model_df.loc[index, 'lat'] = result.lat
        model_df.loc[index, 'lng'] = result.lng
        model_df.loc[index, 'housingUnits'] = result.housing_units
        model_df.loc[index, 'occupiedHousingUnits'] = result.occupied_housing_units
        model_df.loc[index, 'medianIncome'] = result.median_household_income
        model_df.loc[index, 'population'] = result.population
        model_df.loc[index, 'state'] = result.state
    except (KeyError, IndexError):
        print("Missing field/result for... skipping.")

In [62]:
#Saving data for later use in modeling and visulization
model_df.to_csv('Resources/Cleaned_data/clean_added_data.csv', index=False)

## Cleaning Data for Machine Learning

In [241]:
# Reading the recently saved data with new columns 
new_df=pd.read_csv('Resources/Cleaned_data/clean_added_data.csv')

In [242]:
#Recently saved data with new columns 
new_df.head()

Unnamed: 0,name,dob,houseID,houseZip,paymentDate,paymentAmount,rentAmount,age,city,county,lat,lng,housingUnits,occupiedHousingUnits,medianIncome,population,state
0,Karima Germany,1951-05-23,1192,92154,2011-11-01,1321.0,1321.0,71,San Diego,San Diego County,32.6,-117.0,21206.0,20202.0,57618.0,79708.0,CA
1,Agustina Spargo,1900-01-01,21,92111,2011-09-06,2289.0,2289.0,122,San Diego,San Diego County,32.82,-117.15,17268.0,16498.0,57350.0,45096.0,CA
2,Lucilla Broderick,1900-01-01,1474,92159,2011-11-01,1439.0,1439.0,122,San Diego,San Diego County,,,,,,,CA
3,Russ Mchale,1977-04-20,2015,92137,2012-07-01,1744.0,1744.0,45,San Diego,San Diego County,,,,,,,CA
4,Carmelita Ritzer,1969-03-09,311,92136,2011-02-01,1471.0,1471.0,53,San Diego,San Diego County,,,,,,,CA


In [243]:
#Info of the new added columns dataframe
new_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59813 entries, 0 to 59812
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   name                  59813 non-null  object 
 1   dob                   59813 non-null  object 
 2   houseID               59813 non-null  int64  
 3   houseZip              59813 non-null  int64  
 4   paymentDate           59813 non-null  object 
 5   paymentAmount         59813 non-null  float64
 6   rentAmount            59813 non-null  float64
 7   age                   59813 non-null  int64  
 8   city                  59813 non-null  object 
 9   county                59813 non-null  object 
 10  lat                   28178 non-null  float64
 11  lng                   28178 non-null  float64
 12  housingUnits          28178 non-null  float64
 13  occupiedHousingUnits  28178 non-null  float64
 14  medianIncome          23903 non-null  float64
 15  population         

In [244]:
#looking for nan value
new_df.isna().sum()

name                        0
dob                         0
houseID                     0
houseZip                    0
paymentDate                 0
paymentAmount               0
rentAmount                  0
age                         0
city                        0
county                      0
lat                     31635
lng                     31635
housingUnits            31635
occupiedHousingUnits    31635
medianIncome            35910
population              31635
state                       0
dtype: int64

### Recetly added columns from zipcode has missing information about lattitude, lngitude and other information which need to take care.  

## Converting PaymentDate to Binary
### Defining a function to convert to 1 0r 0, if payment date is before or after 10th of month.


In [245]:
new_df['paymentDate']=pd.to_datetime(new_df['paymentDate']).dt.date

In [246]:
# Defining a function to convert to 1 0r 0 if payment date is before or after 10 of month
def payment(x):
    if x.day>10:
        return 0
    else:
        return 1

In [247]:
# Applying the function
new_df['paymentDate']=new_df['paymentDate'].apply(payment)

In [248]:
# paymentDate columns
new_df['paymentDate'].value_counts()

1    55580
0     4233
Name: paymentDate, dtype: int64

## Adding New Target Columns 

### Target column with high or low risk based on the overpayment and missing payment.

In [249]:
#Defining a function to convert to low risk and high risk
def low_high_risk(x,y):
    if (x==0.0 or x>y):
        return "high risk"
    else:
        return "low risk"

In [250]:
# Adding the target column status with low and high risk
new_df['status']=new_df.apply(lambda x: low_high_risk(x['paymentAmount'], x['rentAmount']), axis=1)

In [251]:
new_df.head()

Unnamed: 0,name,dob,houseID,houseZip,paymentDate,paymentAmount,rentAmount,age,city,county,lat,lng,housingUnits,occupiedHousingUnits,medianIncome,population,state,status
0,Karima Germany,1951-05-23,1192,92154,1,1321.0,1321.0,71,San Diego,San Diego County,32.6,-117.0,21206.0,20202.0,57618.0,79708.0,CA,low risk
1,Agustina Spargo,1900-01-01,21,92111,1,2289.0,2289.0,122,San Diego,San Diego County,32.82,-117.15,17268.0,16498.0,57350.0,45096.0,CA,low risk
2,Lucilla Broderick,1900-01-01,1474,92159,1,1439.0,1439.0,122,San Diego,San Diego County,,,,,,,CA,low risk
3,Russ Mchale,1977-04-20,2015,92137,1,1744.0,1744.0,45,San Diego,San Diego County,,,,,,,CA,low risk
4,Carmelita Ritzer,1969-03-09,311,92136,1,1471.0,1471.0,53,San Diego,San Diego County,,,,,,,CA,low risk


In [252]:
# paymentDate columns
new_df['status'].value_counts()

low risk     52640
high risk     7173
Name: status, dtype: int64

## Making Two Different Data Sets for Model Building:
#### 1. Droping Columns with Missing Value
#### 2. Dropping Rows with Missing any Value( Half of the data will be reduced using this method)

In [168]:
#Droping columns and rows with missing value
ml_df_col=new_df.drop(columns=['lat','lng', 'housingUnits',
                                  'occupiedHousingUnits','medianIncome', 'population'], axis=1)
ml_df_row=new_df.dropna(how="any")

In [180]:
# Data sets to store for MongoDB for website building
ml_df_row.to_csv('Resources/Cleaned_data/mongodb.csv', index=False)

# Under Sampling and Over Sampling for Model Building
### Since the data has very small number of low risk data, I used over sampling and under sampling to take care of it.

In [169]:
#Only considering low risk and high risk rows for column nan droped dataframe
low_risk_rows_col = ml_df_col[ml_df_col['status'] == 'low risk']
high_risk_rows_col = ml_df_col[ml_df_col['status'] == 'high risk']
low_risk_rows_col

Unnamed: 0,name,dob,houseID,houseZip,paymentDate,paymentAmount,rentAmount,age,city,county,state,status
0,Karima Germany,1951-05-23,1192,92154,1,1321.0,1321.0,71,San Diego,San Diego County,CA,low risk
1,Agustina Spargo,1900-01-01,21,92111,1,2289.0,2289.0,122,San Diego,San Diego County,CA,low risk
2,Lucilla Broderick,1900-01-01,1474,92159,1,1439.0,1439.0,122,San Diego,San Diego County,CA,low risk
3,Russ Mchale,1977-04-20,2015,92137,1,1744.0,1744.0,45,San Diego,San Diego County,CA,low risk
4,Carmelita Ritzer,1969-03-09,311,92136,1,1471.0,1471.0,53,San Diego,San Diego County,CA,low risk
...,...,...,...,...,...,...,...,...,...,...,...,...
59807,Marylouise Mott,1994-03-21,1633,92159,1,1342.0,1342.0,28,San Diego,San Diego County,CA,low risk
59808,Jennell Buchholtz,1983-05-21,1095,92196,1,1744.0,1744.0,39,San Diego,San Diego County,CA,low risk
59809,Berneice Power,1984-08-24,196,92123,1,1756.0,1756.0,38,San Diego,San Diego County,CA,low risk
59810,Micki Belvin,1900-01-01,1692,92166,1,1896.0,1896.0,122,San Diego,San Diego County,CA,low risk


In [170]:
#Only considering low risk and high risk rows for rows nan droped dataframe
low_risk_rows = ml_df_row[ml_df_row['status'] == 'low risk']
high_risk_rows = ml_df_row[ml_df_row['status'] == 'high risk']
low_risk_rows

Unnamed: 0,name,dob,houseID,houseZip,paymentDate,paymentAmount,rentAmount,age,city,county,lat,lng,housingUnits,occupiedHousingUnits,medianIncome,population,state,status
0,Karima Germany,1951-05-23,1192,92154,1,1321.0,1321.0,71,San Diego,San Diego County,32.60,-117.00,21206.0,20202.0,57618.0,79708.0,CA,low risk
1,Agustina Spargo,1900-01-01,21,92111,1,2289.0,2289.0,122,San Diego,San Diego County,32.82,-117.15,17268.0,16498.0,57350.0,45096.0,CA,low risk
5,Clifton Ellwood,1993-11-02,430,92103,1,1233.0,1233.0,29,San Diego,San Diego County,32.75,-117.17,19080.0,17827.0,62092.0,31066.0,CA,low risk
11,Taneka Noto,1992-05-09,2216,92110,1,988.0,988.0,30,San Diego,San Diego County,32.77,-117.21,11426.0,10630.0,59719.0,25341.0,CA,low risk
14,Curtis Sher,1996-10-14,463,92130,1,1169.0,1169.0,26,San Diego,San Diego County,32.95,-117.20,18272.0,17528.0,127968.0,48940.0,CA,low risk
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59803,Anisa Gammill,1981-12-15,104,92116,1,1524.0,1524.0,41,San Diego,San Diego County,32.76,-117.13,17122.0,16138.0,53448.0,31680.0,CA,low risk
59805,Freddie Bridger,1965-03-06,804,92145,1,1142.0,1142.0,57,San Diego,San Diego County,32.87,-117.14,407.0,390.0,226875.0,1449.0,CA,low risk
59806,Millard Woodford,1900-01-01,1704,92129,1,-159.9,1066.0,122,San Diego,San Diego County,32.96,-117.13,17439.0,17009.0,107870.0,51536.0,CA,low risk
59809,Berneice Power,1984-08-24,196,92123,1,1756.0,1756.0,38,San Diego,San Diego County,32.81,-117.14,10760.0,10039.0,71412.0,26823.0,CA,low risk


## Under Sampling Technique

In [171]:
#Under sampling for rows nan droped dataframe
ml_df_under_row = pd.concat([low_risk_rows.sample(n=len(high_risk_rows), random_state=42), high_risk_rows])
ml_df_under_row = ml_df_under_row.reset_index(drop=True)
ml_df_under_row

Unnamed: 0,name,dob,houseID,houseZip,paymentDate,paymentAmount,rentAmount,age,city,county,lat,lng,housingUnits,occupiedHousingUnits,medianIncome,population,state,status
0,Denny Paschall,1900-01-01,561,92130,1,1077.0,1077.0,122,San Diego,San Diego County,32.95,-117.20,18272.0,17528.0,127968.0,48940.0,CA,low risk
1,Vita Lizaola,1976-10-21,2391,92105,1,1125.0,1125.0,46,San Diego,San Diego County,32.74,-117.09,21744.0,20540.0,37534.0,69813.0,CA,low risk
2,Lanette Preston,1900-01-01,1318,92102,1,1176.0,1176.0,122,San Diego,San Diego County,32.72,-117.12,14987.0,13981.0,40557.0,43267.0,CA,low risk
3,Janna Lautenschlage,1900-01-01,1048,92108,1,1489.0,1489.0,122,San Diego,San Diego County,32.77,-117.15,11601.0,10463.0,63098.0,18858.0,CA,low risk
4,Rebekah Weatherly,1900-01-01,1927,92139,1,1406.0,1406.0,122,San Diego,San Diego County,32.68,-117.05,10629.0,10216.0,58079.0,35125.0,CA,low risk
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5875,Aurora Mccarley,1970-09-02,168,92154,0,1684.0,1531.0,52,San Diego,San Diego County,32.60,-117.00,21206.0,20202.0,57618.0,79708.0,CA,high risk
5876,Paris Teston,1900-01-01,1861,92124,0,2222.0,1778.0,122,San Diego,San Diego County,32.82,-117.07,11445.0,11038.0,80463.0,30443.0,CA,high risk
5877,Jazmine Rudisill,1972-09-21,1074,92105,0,2091.0,1901.0,50,San Diego,San Diego County,32.74,-117.09,21744.0,20540.0,37534.0,69813.0,CA,high risk
5878,Jayme Hidalgo,1991-10-11,1064,92128,1,2058.0,1871.0,31,San Diego,San Diego County,33.00,-117.07,21265.0,20263.0,92531.0,47490.0,CA,high risk


In [172]:
#Under sampling for columns nan droped dataframe
ml_df_under_col = pd.concat([low_risk_rows_col.sample(n=len(high_risk_rows_col), random_state=42), high_risk_rows_col])
ml_df_under_col = ml_df_under_col.reset_index(drop=True)
ml_df_under_col

Unnamed: 0,name,dob,houseID,houseZip,paymentDate,paymentAmount,rentAmount,age,city,county,state,status
0,Isaiah Clayton,1900-01-01,985,92166,1,1854.0,1854.0,122,San Diego,San Diego County,CA,low risk
1,Harland Liggins,1983-11-28,910,92194,1,1978.0,1978.0,39,San Diego,San Diego County,CA,low risk
2,Almeda Sanders,1900-01-01,63,92113,1,1074.0,1074.0,122,San Diego,San Diego County,CA,low risk
3,Tiffany Demaris,2011-02-12,2275,92194,1,1587.0,1587.0,11,San Diego,San Diego County,CA,low risk
4,Leo Moorhouse,1900-01-01,1392,92147,1,1132.0,1132.0,122,San Diego,San Diego County,CA,low risk
...,...,...,...,...,...,...,...,...,...,...,...,...
14341,Marita Iannuzzi,1900-01-01,1600,92124,0,1969.0,1713.0,122,San Diego,San Diego County,CA,high risk
14342,Tomika Sylvestre,1968-09-20,2290,92133,0,1631.0,1483.0,54,San Diego,San Diego County,CA,high risk
14343,Delfina Snider,1900-01-01,538,92195,1,1710.0,1555.0,122,San Diego,San Diego County,CA,high risk
14344,Rudolph Fredrickson,1979-12-18,2007,92177,1,2692.0,2154.0,43,San Diego,San Diego County,CA,high risk


In [173]:
# loking for the target columns after under sampling
ml_df_under_row['status'].value_counts()

low risk     2940
high risk    2940
Name: status, dtype: int64

In [174]:
#Saving the under sampling data for model
ml_df_under_row.to_csv('Resources/Cleaned_data/under_samp_row.csv', index=False)
ml_df_under_col.to_csv('Resources/Cleaned_data/under_samp_col.csv', index=False)

## Over Sampling Technique

In [175]:
#Over sampling for rows nan droped dataframe
ml_df_over_row=pd.concat([low_risk_rows, high_risk_rows.sample(n=len(low_risk_rows), replace=True)])
ml_df_over_row= ml_df_over_row.reset_index(drop=True)
ml_df_over_row

Unnamed: 0,name,dob,houseID,houseZip,paymentDate,paymentAmount,rentAmount,age,city,county,lat,lng,housingUnits,occupiedHousingUnits,medianIncome,population,state,status
0,Karima Germany,1951-05-23,1192,92154,1,1321.0,1321.0,71,San Diego,San Diego County,32.60,-117.00,21206.0,20202.0,57618.0,79708.0,CA,low risk
1,Agustina Spargo,1900-01-01,21,92111,1,2289.0,2289.0,122,San Diego,San Diego County,32.82,-117.15,17268.0,16498.0,57350.0,45096.0,CA,low risk
2,Clifton Ellwood,1993-11-02,430,92103,1,1233.0,1233.0,29,San Diego,San Diego County,32.75,-117.17,19080.0,17827.0,62092.0,31066.0,CA,low risk
3,Taneka Noto,1992-05-09,2216,92110,1,988.0,988.0,30,San Diego,San Diego County,32.77,-117.21,11426.0,10630.0,59719.0,25341.0,CA,low risk
4,Curtis Sher,1996-10-14,463,92130,1,1169.0,1169.0,26,San Diego,San Diego County,32.95,-117.20,18272.0,17528.0,127968.0,48940.0,CA,low risk
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41921,Ivan Berrey,1989-03-13,992,92113,0,1474.0,1282.0,33,San Diego,San Diego County,32.69,-117.12,13196.0,12315.0,27675.0,56066.0,CA,high risk
41922,Linsey Gallop,1900-01-01,1424,92103,0,1545.0,1344.0,122,San Diego,San Diego County,32.75,-117.17,19080.0,17827.0,62092.0,31066.0,CA,high risk
41923,Lynnette Menzie,1900-01-01,1500,92102,0,1652.0,1437.0,122,San Diego,San Diego County,32.72,-117.12,14987.0,13981.0,40557.0,43267.0,CA,high risk
41924,Lloyd Holbert,1900-01-01,1437,92115,0,1991.0,1810.0,122,San Diego,San Diego County,32.76,-117.07,22754.0,21251.0,41866.0,58560.0,CA,high risk


In [176]:
#Over sampling for columns nan droped dataframe
ml_df_over_col=pd.concat([low_risk_rows_col, high_risk_rows_col.sample(n=len(low_risk_rows_col), replace=True)])
ml_df_over_col= ml_df_over_col.reset_index(drop=True)
ml_df_over_col

Unnamed: 0,name,dob,houseID,houseZip,paymentDate,paymentAmount,rentAmount,age,city,county,state,status
0,Karima Germany,1951-05-23,1192,92154,1,1321.0,1321.0,71,San Diego,San Diego County,CA,low risk
1,Agustina Spargo,1900-01-01,21,92111,1,2289.0,2289.0,122,San Diego,San Diego County,CA,low risk
2,Lucilla Broderick,1900-01-01,1474,92159,1,1439.0,1439.0,122,San Diego,San Diego County,CA,low risk
3,Russ Mchale,1977-04-20,2015,92137,1,1744.0,1744.0,45,San Diego,San Diego County,CA,low risk
4,Carmelita Ritzer,1969-03-09,311,92136,1,1471.0,1471.0,53,San Diego,San Diego County,CA,low risk
...,...,...,...,...,...,...,...,...,...,...,...,...
105275,Beckie Balbuena,1984-06-14,187,92135,0,2165.0,1883.0,38,San Diego,San Diego County,CA,high risk
105276,Randi Keppler,1900-01-01,1912,92163,0,1480.0,1346.0,122,San Diego,San Diego County,CA,high risk
105277,Alex Earle,2001-11-02,43,92149,0,1713.0,1490.0,21,San Diego,San Diego County,CA,high risk
105278,Blair Schroder,1973-08-01,216,92122,0,1712.0,1557.0,49,San Diego,San Diego County,CA,high risk


In [177]:
# Loking for the target columns after over sampling technique
ml_df_over_col['status'].value_counts()

low risk     52640
high risk    52640
Name: status, dtype: int64

In [178]:
#Saving the over sampling data for model building
ml_df_over_row.to_csv('Resources/Cleaned_data/over_samp_row.csv', index=False)
ml_df_over_col.to_csv('Resources/Cleaned_data/over_samp_col.csv', index=False)