In [1]:
!pip install textstat

Defaulting to user installation because normal site-packages is not writeable


In [2]:
!pip install -U spacy

Defaulting to user installation because normal site-packages is not writeable


In [3]:
!pip install nltk

Defaulting to user installation because normal site-packages is not writeable


In [4]:
import numpy as np
import pandas as pd
import string
import re
import warnings

import textstat
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

import spacy

import category_encoders as ce
import joblib

warnings.filterwarnings('ignore')

  from pandas import Int64Index as NumericIndex


In [5]:
# Read the raw train / test tables (paths are relative to the notebook).
train_df = pd.read_csv('../../input/train.csv')
test_df = pd.read_csv('../../input/test.csv')

# Detach the LOAN_AMOUNT column into `target`, leaving train_df without it.
target = train_df.pop('LOAN_AMOUNT')

## DESCRIPTION_TRANSLATEDの欠損値をDESCRIPTIONで補完

In [6]:
# Fall back to the original DESCRIPTION wherever the translated text is missing.
# Done for BOTH frames: if only train were filled, a missing test description
# would reach pre_preprocess as NaN and be stringified to the literal 'nan'.
for frame in (train_df, test_df):
    frame['DESCRIPTION_TRANSLATED'] = frame['DESCRIPTION_TRANSLATED'].fillna(frame['DESCRIPTION'])

In [7]:
# Inspect column dtypes and non-null counts of the training frame.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 91333 entries, 0 to 91332
Data columns (total 17 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   LOAN_ID                          91333 non-null  int64  
 1   ORIGINAL_LANGUAGE                91333 non-null  object 
 2   DESCRIPTION                      91333 non-null  object 
 3   DESCRIPTION_TRANSLATED           91333 non-null  object 
 4   IMAGE_ID                         91333 non-null  int64  
 5   ACTIVITY_NAME                    91333 non-null  object 
 6   SECTOR_NAME                      91333 non-null  object 
 7   LOAN_USE                         91333 non-null  object 
 8   COUNTRY_CODE                     91333 non-null  object 
 9   COUNTRY_NAME                     91333 non-null  object 
 10  TOWN_NAME                        88573 non-null  object 
 11  CURRENCY_POLICY                  91333 non-null  object 
 12  CURRENCY_EXCHANGE_

## DESCRIPTIONとDESCRIPTION_TRANSLATEDの前処理

In [8]:
# Characters that clean_puncts surrounds with a space on each side.
# Besides punctuation and symbols, the list contains whitespace ('\n', '\t')
# and several accented letters ('à', 'â', 'é', 'è', 'ï'), so words containing
# those letters are split apart by clean_puncts.
# NOTE(review): inside preprocess() the text is lower-cased and passed through
# rm_spaces before clean_puncts runs, so the upper-case entries ('Â', 'Ã', 'Ø')
# and the entries rm_spaces already replaces ('\xa0', '\u3000') cannot match there.
puncts = [',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&', '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£',
 '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',  '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…', '\n', '\xa0', '\t',
 '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', '\u3000', '\u202f',
 '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', '«',
 '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√', ]


# Literal tag strings that clean_html_tags deletes by exact substring match.
# All entries are lower-case and attribute-free, so a tag written with
# attributes (e.g. '<p class="x">') or in upper case is not matched by this list.
html_tags = ['<p>', '</p>', '<table>', '</table>', '<tr>', '</tr>', '<ul>', '<ol>', '<dl>', '</ul>', '</ol>',
             '</dl>', '<li>', '<dd>', '<dt>', '</li>', '</dd>', '</dt>', '<h1>', '</h1>',
             '<br>', '<br/>', '<br />','<strong>', '</strong>', '<span>', '</span>', '<blockquote>', '</blockquote>',
             '<pre>', '</pre>', '<div>', '</div>', '<h2>', '</h2>', '<h3>', '</h3>', '<h4>', '</h4>', '<h5>', '</h5>',
             '<h6>', '</h6>', '<blck>', '<pr>', '<code>', '<th>', '</th>', '<td>', '</td>', '<em>', '</em>']

# HTML entities that clean_html_tags replaces with a single space.
# Every entry must be followed by a comma: Python concatenates adjacent string
# literals, and a missing comma after '&ensp;' used to fuse it with '&quot;'
# into the single entry '&ensp;&quot;', so neither entity was ever replaced.
empty_expressions = [
    '&lt;', '&gt;', '&amp;', '&nbsp;',
    '&emsp;', '&ndash;', '&mdash;', '&ensp;',
    '&quot;', '&#39;',
]

In [9]:
def pre_preprocess(x):
    """Coerce ``x`` to ``str`` and return the lower-cased text.

    Non-string inputs are stringified first, so e.g. ``None`` yields ``'none'``.
    """
    as_text = str(x)
    return as_text.lower()

def rm_spaces(text):
    """Return ``text`` with every character from the hard-coded list below
    replaced by a single ASCII space."""
    blank_like = (
        '\u200b', '\u200e', '\u202a', '\u2009', '\u2028', '\u202c', '\ufeff',
        '\uf0d8', '\u2061', '\u3000', '\x10', '\x7f', '\x9d', '\xad',
        '\x97', '\x9c', '\x8b', '\x81', '\x80', '\x8c', '\x85', '\x92',
        '\x88', '\x8d', '\x80', '\x8e', '\x9a', '\x94', '\xa0',
        '\x8f', '\x82', '\x8a', '\x93', '\x90', '\x83', '\x96', '\x9b',
        '\x9e', '\x99', '\x87', '\x84', '\x9f',
    )
    # Every entry is a single character mapped to ' ', so one translation
    # table yields the same result as replacing them one after another.
    to_space = dict.fromkeys(map(ord, blank_like), ' ')
    return text.translate(to_space)

def remove_urls(x):
    """Strip http(s) URLs and ``quote=...`` fragments from the string ``x``.

    Args:
        x: Text to clean (a ``str``).

    Returns:
        ``x`` with each ``http://`` / ``https://`` URL removed in full, and with
        any fragment matched by the ``quote=`` pattern removed.
    """
    # Consume the whole URL, not just the host name: a host-only character
    # class stops at the first '/' and leaves the path and query string behind
    # as junk tokens. Whitespace, '<', '>' and quotes end the match so that
    # adjacent HTML markup is left for the later tag-cleaning step.
    x = re.sub(r'https?://[^\s<>"\']*', r'', x)

    # Pattern kept exactly as the notebook author wrote it (marked "original"
    # there); presumably targets dataset-specific `quote=...` residue -- TODO confirm.
    x = re.sub(r'(quote=\w+\s?\w+;?\w+)', r'', x)
    return x

def clean_puncts(x):
    """Pad every occurrence of each entry in the module-level ``puncts`` list
    with one space on either side and return the resulting string."""
    for mark in puncts:
        padded = f' {mark} '
        x = x.replace(mark, padded)
    return x

def clean_html_tags(x, stop_words=None):
    """Remove known HTML tags, HTML entities and optional extra substrings.

    Args:
        x: Text to clean (a ``str``).
        stop_words: Optional iterable of substrings to delete from ``x``.
            Defaults to ``None`` (nothing extra removed). Removal is a plain
            substring replacement, so an entry is also removed when it occurs
            inside a longer word.

    Returns:
        ``x`` with every entry of ``html_tags`` deleted, every entry of
        ``empty_expressions`` replaced by a single space, and every entry of
        ``stop_words`` deleted, applied in that order.
    """
    for tag in html_tags:
        x = x.replace(tag, '')
    for entity in empty_expressions:
        x = x.replace(entity, ' ')
    # `None` instead of a mutable `[]` default: a default list is shared
    # across all calls of the function.
    if stop_words is not None:
        for word in stop_words:
            x = x.replace(word, '')
    return x

def preprocess(data):
    """Apply the text-cleaning steps element-wise to ``data`` and return the result.

    ``data`` must expose ``.apply`` (this notebook passes pandas Series).
    Steps run in this order: lower-casing, blank-character replacement,
    URL removal, HTML tag/entity removal, punctuation padding.
    """
    steps = (pre_preprocess, rm_spaces, remove_urls, clean_html_tags, clean_puncts)
    for step in steps:
        data = data.apply(step)
    return data

In [10]:
# Store a cleaned copy of the translated description in a new column of each frame.
for frame in (train_df, test_df):
    frame['clean_DESCRIPTION_TRANSLATED'] = preprocess(frame['DESCRIPTION_TRANSLATED'])

In [22]:
# Spot-check the cleaned text of the first training row.
train_df.loc[0, 'clean_DESCRIPTION_TRANSLATED']

'teodora is a 50 - year - old married woman from the town of maribojoc .  she weaves nipa palm  ( used as roof material for thatched houses or dwelling )  .  she has been engaged in this kind of livelihood for more than ten years now ,  serving town residents . this livelihood is a lucrative venture in her place .  right now ,  she needs financial support in order to improve the flow of her business ,  its sales ,  and profit . a loan worth of 5 , 000 php will be used to purchase materials like nipa palm ,  bamboo sticks and nito  ( dried paplam  ( for binding  )  . in the future ,  teodora dreams of expanding her business to make it more productive and profitable to support the schooling of her children . '

## カテゴリエンコーディング

In [12]:
# Category values that get collapsed into a single 'other' label.
OTHER_COUNTRY = [
    'EG', 'MZ', 'HT', 'MX', 'BO', 'US', 'TO', 'SB', 'AL', 'CR',
    'GE', 'SL', 'ZM', 'FJ', 'BR', 'MD', 'ML', 'CM', 'MW', 'DO',
    'XK', 'TR', 'TH', 'NP', 'PG', 'PA', 'PR', 'LS', 'IL', 'AM',
]

OTHER_SECTOR_NAME = [
    'Transportation', 'Construction', 'Manufacturing', 'Entertainment', 'Wholesale',
]

In [13]:
# Collapse the listed categories into one 'other' label, identically for
# the train and test frames.
rare_values_by_column = {
    'SECTOR_NAME': OTHER_SECTOR_NAME,
    'COUNTRY_CODE': OTHER_COUNTRY,
}
for frame in (train_df, test_df):
    for column, rare_values in rare_values_by_column.items():
        is_rare = frame[column].isin(rare_values)
        frame[column] = frame[column].mask(is_rare, 'other')

In [14]:
# Stack train on top of test with a fresh 0..n-1 index; the combined frame is
# what the encoder in the following cell is fitted on.
df = pd.concat([train_df, test_df], ignore_index=True)
df.head()

Unnamed: 0,LOAN_ID,ORIGINAL_LANGUAGE,DESCRIPTION,DESCRIPTION_TRANSLATED,IMAGE_ID,ACTIVITY_NAME,SECTOR_NAME,LOAN_USE,COUNTRY_CODE,COUNTRY_NAME,TOWN_NAME,CURRENCY_POLICY,CURRENCY_EXCHANGE_COVERAGE_RATE,CURRENCY,TAGS,REPAYMENT_INTERVAL,DISTRIBUTION_MODEL,clean_DESCRIPTION_TRANSLATED
0,1733169,English,Teodora is a 50-year-old married woman from th...,Teodora is a 50-year-old married woman from th...,3115271,Weaving,Arts,"to purchase materials like nipa palm, bamboo ...",PH,Philippines,"Maribojoc, Bohol",shared,0.1,PHP,#Elderly,monthly,field_partner,teodora is a 50 - year - old married woman fro...
1,1546998,English,Diego is 32 years old and lives in the municip...,Diego is 32 years old and lives in the municip...,2870403,Barber Shop,Services,"to buy two hair clippers, a new barber chair, ...",CO,Colombia,Apartadó,shared,0.1,COP,"user_favorite, user_favorite",monthly,field_partner,diego is 32 years old and lives in the municip...
2,1808517,Spanish,"Osman, es un joven de 27 años de edad, soltero...","Osman is a young man, 27 years old, single, an...",3215705,Farming,Agriculture,to purchase sacks of fertilizers to care for a...,HN,Honduras,"Nueva Frontera, Santa Barbara.",shared,0.1,HNL,,bullet,field_partner,"osman is a young man , 27 years old , single..."
3,1452940,English,"His name is Nino, 31 years old, married to Che...","His name is Nino, 31 years old, married to Che...",2745031,Motorcycle Transport,other,"to pay for fuel, tires and change oil for his ...",PH,Philippines,"Silang, Cavite",shared,0.1,PHP,user_favorite,monthly,field_partner,"his name is nino , 31 years old , married to..."
4,1778420,English,"Pictured above is Teresa, often described as a...","Pictured above is Teresa, often described as a...",3083800,Farming,Agriculture,to purchase hybrid seeds and fertilizer to imp...,KE,Kenya,Mumias,shared,0.1,KES,"#Eco-friendly, #Sustainable Ag, #Parent, #Elde...",bullet,field_partner,"pictured above is teresa , often described as..."


In [15]:
# Ordinal-encode the two categorical columns.
label_enc_features = ['SECTOR_NAME', 'COUNTRY_CODE']
# NOTE(review): 'impute' looks like a legacy value for handle_unknown; newer
# category_encoders releases appear to document 'error' / 'return_nan' / 'value'.
# Confirm against the installed version before relying on how unseen
# categories are encoded by the saved encoder.
ce_label_enc = ce.OrdinalEncoder(cols=label_enc_features, handle_unknown='impute')
# Fit on the combined train+test frame so categories from both are known.
ce_label_enc.fit(df)

# Replace both frames with their encoded versions.
train_df = ce_label_enc.transform(train_df)
test_df = ce_label_enc.transform(test_df)

# Persist the fitted encoder next to the notebook for later reuse.
joblib.dump(ce_label_enc, 'ce_label_enc.joblib')

['ce_label_enc.joblib']

In [16]:
# Shift both encoded columns down by one in each frame; the value_counts
# output that follows shows SECTOR_NAME codes running from 0 after this step.
# Not idempotent: re-running this cell subtracts one again.
for frame in (train_df, test_df):
    for column in ('SECTOR_NAME', 'COUNTRY_CODE'):
        frame[column] = frame[column] - 1

In [17]:
# Frequency of each encoded SECTOR_NAME code in the training frame.
train_df['SECTOR_NAME'].value_counts()

2     24965
4     18701
6     15155
7      8185
1      5491
8      4372
9      4124
5      3533
3      3122
0      1977
10     1708
Name: SECTOR_NAME, dtype: int64

In [18]:
# Frequency of each encoded SECTOR_NAME code in the test frame, for comparison with train.
test_df['SECTOR_NAME'].value_counts()

2     29058
4     20790
6     17756
7      4816
1      4255
8      3924
5      3099
9      2746
3      2279
0      1664
10     1435
Name: SECTOR_NAME, dtype: int64

In [19]:
# Put the LOAN_AMOUNT target (split off right after loading) back onto the training frame.
# Assignment aligns on index -- assumes the encoder's transform kept train_df's
# original row index; TODO confirm.
train_df['LOAN_AMOUNT'] = target

In [20]:
# Preview the processed training frame (encoded columns plus the re-attached target).
train_df.head()

Unnamed: 0,LOAN_ID,ORIGINAL_LANGUAGE,DESCRIPTION,DESCRIPTION_TRANSLATED,IMAGE_ID,ACTIVITY_NAME,SECTOR_NAME,LOAN_USE,COUNTRY_CODE,COUNTRY_NAME,TOWN_NAME,CURRENCY_POLICY,CURRENCY_EXCHANGE_COVERAGE_RATE,CURRENCY,TAGS,REPAYMENT_INTERVAL,DISTRIBUTION_MODEL,clean_DESCRIPTION_TRANSLATED,LOAN_AMOUNT
0,1733169,English,Teodora is a 50-year-old married woman from th...,Teodora is a 50-year-old married woman from th...,3115271,Weaving,0,"to purchase materials like nipa palm, bamboo ...",0,Philippines,"Maribojoc, Bohol",shared,0.1,PHP,#Elderly,monthly,field_partner,teodora is a 50 - year - old married woman fro...,100
1,1546998,English,Diego is 32 years old and lives in the municip...,Diego is 32 years old and lives in the municip...,2870403,Barber Shop,1,"to buy two hair clippers, a new barber chair, ...",1,Colombia,Apartadó,shared,0.1,COP,"user_favorite, user_favorite",monthly,field_partner,diego is 32 years old and lives in the municip...,1350
2,1808517,Spanish,"Osman, es un joven de 27 años de edad, soltero...","Osman is a young man, 27 years old, single, an...",3215705,Farming,2,to purchase sacks of fertilizers to care for a...,2,Honduras,"Nueva Frontera, Santa Barbara.",shared,0.1,HNL,,bullet,field_partner,"osman is a young man , 27 years old , single...",225
3,1452940,English,"His name is Nino, 31 years old, married to Che...","His name is Nino, 31 years old, married to Che...",2745031,Motorcycle Transport,3,"to pay for fuel, tires and change oil for his ...",0,Philippines,"Silang, Cavite",shared,0.1,PHP,user_favorite,monthly,field_partner,"his name is nino , 31 years old , married to...",350
4,1778420,English,"Pictured above is Teresa, often described as a...","Pictured above is Teresa, often described as a...",3083800,Farming,2,to purchase hybrid seeds and fertilizer to imp...,3,Kenya,Mumias,shared,0.1,KES,"#Eco-friendly, #Sustainable Ag, #Parent, #Elde...",bullet,field_partner,"pictured above is teresa , often described as...",625


In [21]:
# Write both processed frames to CSV in the working directory, without the index column.
outputs = (
    (train_df, 'preprocess_train.csv'),
    (test_df, 'preprocess_test.csv'),
)
for frame, csv_path in outputs:
    frame.to_csv(csv_path, index=False)

In [23]:
# Spot-check the cleaned text of the first test row.
test_df.loc[0, 'clean_DESCRIPTION_TRANSLATED']

'marcela is 69 years old and married with ten children .  marcela works very hard to provide for them . she runs a general store in the philippines and requested a 30 , 000 php loan through nwtf to buy items to sell like canned goods and personal care products . she has been in this business for 25 years . in the future marcela would like to build and expand her business to secure the future of her family . '

In [24]:
# Show the cleaned text of the first training row (same lookup as the earlier spot-check).
train_df.loc[0, 'clean_DESCRIPTION_TRANSLATED']

'teodora is a 50 - year - old married woman from the town of maribojoc .  she weaves nipa palm  ( used as roof material for thatched houses or dwelling )  .  she has been engaged in this kind of livelihood for more than ten years now ,  serving town residents . this livelihood is a lucrative venture in her place .  right now ,  she needs financial support in order to improve the flow of her business ,  its sales ,  and profit . a loan worth of 5 , 000 php will be used to purchase materials like nipa palm ,  bamboo sticks and nito  ( dried paplam  ( for binding  )  . in the future ,  teodora dreams of expanding her business to make it more productive and profitable to support the schooling of her children . '