# Feature Engineering

This part of the project uses the previously cleaned dataset (`filtered_loans_2007.csv`). The cleaning steps are described in the data-cleaning part of the project (link to be added).

**Techniques used:**
* Pandas

## Load the cleaned dataset

In [1]:
import pandas as pd

# Read the previously cleaned loans data into the working DataFrame.
loans = pd.read_csv(filepath_or_buffer='filtered_loans_2007.csv')

### Count null values

In [2]:
# Per-column tally of missing entries: flag every null cell, then sum the
# flags down each column.
null_counts = pd.isnull(loans).sum(axis=0)

print(null_counts)

loan_amnt                  0
term                       0
int_rate                   0
installment                0
emp_length              1036
home_ownership             0
annual_inc                 0
verification_status        0
loan_status                0
purpose                    0
title                     11
addr_state                 0
dti                        0
delinq_2yrs                0
earliest_cr_line           0
inq_last_6mths             0
open_acc                   0
pub_rec                    0
revol_bal                  0
revol_util                50
total_acc                  0
last_credit_pull_d         2
pub_rec_bankruptcies     697
dtype: int64


### Drop *pub_rec_bankruptcies* (more than 1% missing) and rows with remaining missing values

In [3]:
# Remove the pub_rec_bankruptcies column outright, then discard every row
# that still contains a null in any of the remaining columns.
loans = (
    loans
    .drop('pub_rec_bankruptcies', axis='columns')
    .dropna(axis='index')
)

# Show how many of the remaining columns fall under each dtype.
print(loans.dtypes.value_counts())

object     11
float64    10
int64       1
dtype: int64


### Select *object* columns 

In [4]:
# Keep only the object-dtype columns so their raw values can be inspected.
object_columns_df = loans.select_dtypes(include=['object'])

# Print the first row (position 0) as a sample of each column's format.
print(object_columns_df.iloc[0, :])

term                     36 months
int_rate                    10.65%
emp_length               10+ years
home_ownership                RENT
verification_status       Verified
purpose                credit_card
title                     Computer
addr_state                      AZ
earliest_cr_line          Jan-1985
revol_util                   83.7%
last_credit_pull_d        Jun-2016
Name: 0, dtype: object


In [5]:
cols = [
    'home_ownership',
    'verification_status',
    'emp_length',
    'term',
    'addr_state',
]

# Print the frequency table of every listed column, in list order.
for column_name in cols:
    frequencies = loans[column_name].value_counts()
    print(frequencies)

RENT        18112
MORTGAGE    16686
OWN          2778
OTHER          96
NONE            3
Name: home_ownership, dtype: int64
Not Verified       16281
Verified           11856
Source Verified     9538
Name: verification_status, dtype: int64
10+ years    8545
< 1 year     4513
2 years      4303
3 years      4022
4 years      3353
5 years      3202
1 year       3176
6 years      2177
7 years      1714
8 years      1442
9 years      1228
Name: emp_length, dtype: int64
 36 months    28234
 60 months     9441
Name: term, dtype: int64
CA    6776
NY    3614
FL    2704
TX    2613
NJ    1776
IL    1447
PA    1442
VA    1347
GA    1323
MA    1272
OH    1149
MD    1008
AZ     807
WA     788
CO     748
NC     729
CT     711
MI     678
MO     648
MN     581
NV     466
SC     454
WI     427
OR     422
LA     420
AL     420
KY     311
OK     285
KS     249
UT     249
AR     229
DC     209
RI     194
NM     180
WV     164
HI     162
NH     157
DE     110
MT      77
WY      76
AK      76
SD      60
VT  

In [6]:
# Frequency tables for `purpose` and `title`, printed in that order.
for column_name in ('purpose', 'title'):
    print(loans[column_name].value_counts())

debt_consolidation    17751
credit_card            4911
other                  3711
home_improvement       2808
major_purchase         2083
small_business         1719
car                    1459
wedding                 916
medical                 655
moving                  552
house                   356
vacation                348
educational             312
renewable_energy         94
Name: purpose, dtype: int64
Debt Consolidation                        2068
Debt Consolidation Loan                   1599
Personal Loan                              624
Consolidation                              488
debt consolidation                         466
Credit Card Consolidation                  345
Home Improvement                           336
Debt consolidation                         314
Small Business Loan                        298
Credit Card Loan                           294
Personal                                   290
Consolidation Loan                         250
Home Improvement

### Drop columns, convert percentage strings, and map *emp_length* to numbers

In [7]:
# Nested mapping in the {column: {old_value: new_value}} form accepted by
# DataFrame.replace. Each emp_length label becomes its number of years:
# "10+ years" is capped at 10, and both "< 1 year" and "n/a" become 0.
mapping_dict = {
    "emp_length": {
        "10+ years": 10,
        "9 years": 9,
        "8 years": 8,
        "7 years": 7,
        "6 years": 6,
        "5 years": 5,
        "4 years": 4,
        "3 years": 3,
        "2 years": 2,
        "1 year": 1,
        "< 1 year": 0,
        "n/a": 0
    }
}

# Columns removed from the frame in this step.
cols = ['last_credit_pull_d', 'addr_state', 'title', 'earliest_cr_line']

loans = loans.drop(cols, axis = 1)

# Both rate columns hold strings with a trailing percent sign (e.g. "10.65%");
# strip the sign and parse the remainder as a float.
loans['int_rate'] = loans['int_rate'].str.rstrip('%').astype('float')
loans['revol_util'] = loans['revol_util'].str.rstrip('%').astype('float')

loans = loans.replace(mapping_dict)

# replace() only produces a numeric emp_length through pandas' implicit
# downcasting of object columns, which is deprecated as of pandas 2.2.
# Cast explicitly so the column is int64 on every pandas version; this also
# fails loudly if a label that is missing from mapping_dict slips through.
loans['emp_length'] = loans['emp_length'].astype('int64')

### One-hot encode categorical columns

In [8]:
cols = ['home_ownership', 'verification_status', 'purpose', 'term']

# One indicator column per distinct value of each listed column.
dummy_df = pd.get_dummies(loans[cols])

# Swap the original categorical columns for their indicator columns: the
# untouched columns keep their order and the indicators are appended after.
loans = pd.concat([loans.drop(cols, axis=1), dummy_df], axis=1)