In [1]:
import pandas as pd

from src.cleaning_functions import *

## Nike data.

In [2]:
# Load the raw Nike tweets export from the project's data folder.
nike = pd.read_csv('data/nike.csv')

In [3]:
# Keep only the tweet text, as a single-column DataFrame named 'text'.
nike_tw = nike['text'].to_frame()

### Exploración inicial de los datos:

In [4]:
# (rows, columns) of the tweet-text frame before any cleaning.
nike_tw.shape

(29804, 1)

In [5]:
# Column dtypes, non-null counts and memory usage.
nike_tw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29804 entries, 0 to 29803
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    29804 non-null  object
dtypes: object(1)
memory usage: 233.0+ KB


In [6]:
# Summary of the text column: count, distinct values, most frequent tweet and its frequency.
nike_tw.describe()

Unnamed: 0,text
count,29804
unique,29276
top,@whatudohazz nike
freq,59


In [7]:
nike_tw.isna().sum()  # Per-column count of missing values: there are none.

text    0
dtype: int64

In [8]:
nike_tw.duplicated().sum() # Number of fully duplicated rows: 528 found.

528

In [9]:
# Remove the duplicated rows with the project helper from src.cleaning_functions.
# NOTE(review): the return value is discarded, so this presumably mutates
# nike_tw in place (the next shape check reports 29276 rows) — confirm in the helper.
drop_duplicates(nike_tw)

In [10]:
# Re-check the shape after de-duplication.
nike_tw.shape

(29276, 1)

### Preparing Data.

**Limpieza básica:**
- Pasamos todo a minúsculas.
- Eliminamos las menciones de usuario ('@' + usuario), ya que en principio el nombre de los usuarios no aporta información relevante para el modelo.
- Eliminamos urls.
- Eliminamos los valores no alfabéticos (el símbolo '#', números...), aunque mantenemos el texto que sigue al '#', ya que puede contener información relevante.

Esta limpieza se ejecutará a través de nuestra función `basic_cleanning()`.

In [11]:
# Peek at the first raw tweets before cleaning.
nike_tw.head()

Unnamed: 0,text
0,@ZubyMusic @LPMisesCaucus Nobody cares about N...
1,@PoojaPraharaj @IndiainUkraine @PMOIndia @MEAI...
2,@DocumentWomen Nike Okundaye. Also knowns Nike...
3,Day 4 of #maxmadness #AirMaxMonth \nNike Air M...
4,@MagMr44 @soleguru @nikestore @Nike @SneakerAd...


In [12]:
# Run every tweet through the project's basic_cleanning helper and keep the
# result as a single-column ('text') DataFrame.
nike_tw = nike_tw['text'].apply(basic_cleanning).to_frame()

In [13]:
# Peek at the same rows after cleaning.
nike_tw.head()

Unnamed: 0,text
0,nobody cares about nike in russia russia is al...
1,ye green nike hoodie waala weeks pehle putin s...
2,nike okundaye also knowns nike twins seven sev...
3,day of air max month nike air max viotech and ...
4,thank you sir


In [14]:
# Show the full cleaned text of the row whose index label is 1.
nike_tw.loc[1, 'text']

'ye green nike hoodie waala weeks pehle putin se disappointed tha he said putin ne dhoka de diya he was expecting thrill from putin ab jab thrill mil raha hai to darkhwaast kar raha hai he is so shameless that he is still requesting and coming in video'

In [15]:
# Spot-check the cleaner on one string that mixes a mention, a hashtag,
# digits, a URL and brackets.
basic_cleanning("@Ironhack's- #Q website 776-is http://ironhack.com [(2018)]")

'q website is'

In [18]:
from nltk.corpus import wordnet

In [19]:
# List the WordNet synsets (noun and verb senses) for "price".
syns = wordnet.synsets("price")
print(syns)

[Synset('monetary_value.n.01'), Synset('price.n.02'), Synset('price.n.03'), Synset('price.n.04'), Synset('price.n.05'), Synset('price.n.06'), Synset('price.n.07'), Synset('price.v.01'), Synset('price.v.02')]


In [31]:
# Hand-curated vocabulary of price- and money-related words, built up group by
# group. Order and contents are significant only in that they are preserved
# exactly as originally written.
# NOTE(review): 'bill', 'cost', 'money' and 'brass' each appear twice, so the
# list holds 95 entries but only 91 distinct words — confirm whether the
# duplicates matter to whatever consumes this list.
price_list = []

# Charges, fees and valuations.
price_list += ['bill', 'carfare', 'charge', 'cost', 'cost', 'damage', 'expenditure', 'expense']
price_list += ['fare', 'fee', 'invoice', 'levy', 'monetary', 'money', 'outlay', 'pay', 'price', 'pricy']
price_list += ['quotation', 'rate', 'settlement', 'sum', 'terms', 'toll', 'valuation', 'value']

# Money, cash and coinage.
price_list += ['money', 'cash', 'wherewithal', 'means', 'funds', 'capital', 'lucre', 'banknote', 'note', 'noting']
price_list += ['coin', 'copper', 'silver', 'cent', 'penny', 'pence', 'dollar', 'buck', 'smacker', 'plonk', 'plunk']
price_list += ['currency', 'sterling', 'bill', 'dough', 'bread', 'rich', 'dime', 'gold']

# Informal and slang words for money.
price_list += ['loot', 'plunder', 'shekels', 'moolah', 'boodle', 'dibs', 'brass', 'gelt', 'ducats', 'rhino', 'gravy']
price_list += ['oof', 'dosh', 'brass', 'lolly', 'spondulicks', 'wonga', 'ackers', 'dinero', 'euros', 'greenbacks']

# Remaining slang, then value judgements, deals and rip-offs.
price_list += ['simoleons', 'mazuma', 'splosh', 'worthy', 'cheap', 'economic', 'economy', 'competitive', 'affordable']
price_list += ['budget', 'bargain', 'sale', 'discount', 'expensive', 'rob', 'steal', 'theft', 'thievery', 'fraud']

# Display the vocabulary alphabetically (price_list itself stays unsorted).
sorted(price_list)

['ackers',
 'affordable',
 'banknote',
 'bargain',
 'bill',
 'bill',
 'boodle',
 'brass',
 'brass',
 'bread',
 'buck',
 'budget',
 'capital',
 'carfare',
 'cash',
 'cent',
 'charge',
 'cheap',
 'coin',
 'competitive',
 'copper',
 'cost',
 'cost',
 'currency',
 'damage',
 'dibs',
 'dime',
 'dinero',
 'discount',
 'dollar',
 'dosh',
 'dough',
 'ducats',
 'economic',
 'economy',
 'euros',
 'expenditure',
 'expense',
 'expensive',
 'fare',
 'fee',
 'fraud',
 'funds',
 'gelt',
 'gold',
 'gravy',
 'greenbacks',
 'invoice',
 'levy',
 'lolly',
 'loot',
 'lucre',
 'mazuma',
 'means',
 'monetary',
 'money',
 'money',
 'moolah',
 'note',
 'noting',
 'oof',
 'outlay',
 'pay',
 'pence',
 'penny',
 'plonk',
 'plunder',
 'plunk',
 'price',
 'pricy',
 'quotation',
 'rate',
 'rhino',
 'rich',
 'rob',
 'sale',
 'settlement',
 'shekels',
 'silver',
 'simoleons',
 'smacker',
 'splosh',
 'spondulicks',
 'steal',
 'sterling',
 'sum',
 'terms',
 'theft',
 'thievery',
 'toll',
 'valuation',
 'value',
 'wherew