In [1]:
%load_ext autoreload
%autoreload 2

In [13]:
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\anapm\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\anapm\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

In [20]:
from sklearn.pipeline import Pipeline
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
import numpy as np

In [2]:
from transformers import RemoveWords, GetTags, GetBase, Limpieza, common_tag_list, Tokenize, UnTokenize

In [3]:
data = pd.read_csv(r'datasets\\dontpatronizeme_pcl.tsv', skiprows=4, sep='\t', header=None)
data.head()

Unnamed: 0,0,1,2,3,4,5
0,5,@@1494111,refugee,ca,""" Just like we received migrants fleeing El Sa...",0
1,6,@@9382277,in-need,in,"To bring down high blood sugar levels , insuli...",0
2,7,@@7562079,refugee,za,The European Union is making an historic mista...,0
3,8,@@23663488,hopeless,nz,""" They 're either hopeless for being beaten by...",0
4,9,@@3449225,homeless,ph,"NUEVA ERA , Ilocos Norte - No family shall be ...",1


In [16]:
df = pd.DataFrame(columns=['text','PCL'],
                 data = [
                     [['Murder','rate','has','been','raising','in','the','last','5','years'],0],
                     [['we','have','to','take','care','of','the','lovely','women'],1],
                     [['1st','minister','announced','yesterday','a','new','policy'],0],
                     [['the','inmigrants','have','endured','the','worst','but','they','smile','everyday'],1],
                     [['more','cases','of','familiar','violence','have','been','reported'],0],
                     [['Sri','Lankan','norms','and','culture','inhibit','women','development'],0],
                     [['Anja','Ringgren','Loven','I',"can't",'find','a','word','to','describe'],1],
                     [['Is','the','raising','of','prices','the','main','reason','for','vulnerability'],0],
                     [['In','Lybia','today','there','are','countless','number','of'],0],
                 ])
X = df.text
y = df.PCL
pipe1 = Pipeline(steps=[('limpieza', Limpieza()),
                         ('tokenize', Tokenize()),
                        ('remove_stopwords', RemoveWords()),
                        ('obtener_tags', GetTags(tag_list = common_tag_list)),
                         ('get_base', GetBase()),
                         ('untokenize', UnTokenize()),
                         ('tfidf', TfidfVectorizer())
                         ])
res = pipe1.fit_transform(data.iloc[:,4],y)

0        [Just, like, we, received, migrants, fleeing, ...
1        [To, bring, down, high, blood, sugar, levels, ...
2        [The, European, Union, is, making, an, histori...
3        [Theyre, either, hopeless, for, being, beaten,...
4        [NUEVA, ERA, Ilocos, Norte, -, No, family, sha...
                               ...                        
10460    [Sri, Lankan, norms, and, culture, inhibit, wo...
10461    [He, added, that, the, AFP, will, continue, to...
10462    [She, has, one, huge, platform, and, informati...
10463    [Anja, Ringgren, Loven, I, cant, find, a, word...
10464    [Guinness, World, Record, of, lbs, of, -layer,...
Name: 4, Length: 10464, dtype: object


In [23]:
res.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [5]:
limpi = Limpieza().fit_transform(data.iloc[:,4])
remo = RemoveWords().fit_transform(limpi)
remo

"|,|!|:|\d|\.|'
0        [, Just, like, we, received, migrants, fleeing...
1        [To, bring, down, high, blood, sugar, levels, ...
2        [The, European, Union, is, making, an, histori...
3        [, Theyre, either, hopeless, for, being, beate...
4        [NUEVA, ERA, Ilocos, Norte, -, No, family, sha...
                               ...                        
10460    [Sri, Lankan, norms, and, culture, inhibit, wo...
10461    [He, added, that, the, AFP, will, continue, to...
10462    [, She, has, one, huge, platform, and, informa...
10463    [, Anja, Ringgren, Loven, I, cant, find, a, wo...
10464    [, Guinness, World, Record, of, lbs, of, -laye...
Name: 4, Length: 10464, dtype: object


0        [, like, received, migrants, fleeing, El, Salv...
1        [bring, high, blood, sugar, levels, insulin, n...
2        [European, Union, making, historic, mistake, h...
3        [, Theyre, either, hopeless, beaten, -year-old...
4        [NUEVA, ERA, Ilocos, Norte, -, family, shall, ...
                               ...                        
10460    [Sri, Lankan, norms, culture, inhibit, women, ...
10461    [added, AFP, continue, bank, application, whol...
10462    [, one, huge, platform, information, go, place...
10463    [, Anja, Ringgren, Loven, cant, find, word, de...
10464    [, Guinness, World, Record, lbs, -layer, munch...
Name: 4, Length: 10464, dtype: object

In [77]:
limpi.dropna().isna().sum()

0

In [74]:
def f(l):
    try:
        return [w for w in l ]
    except:
        print(l)    

In [75]:
limpi.apply(f)

nan


0        [, Just, like, we, received, migrants, fleeing...
1        [To, bring, down, high, blood, sugar, levels, ...
2        [The, European, Union, is, making, an, histori...
3        [, Theyre, either, hopeless, for, being, beate...
4        [NUEVA, ERA, Ilocos, Norte, -, No, family, sha...
                               ...                        
10460    [Sri, Lankan, norms, and, culture, inhibit, wo...
10461    [He, added, that, the, AFP, will, continue, to...
10462    [, She, has, one, huge, platform, and, informa...
10463    [, Anja, Ringgren, Loven, I, cant, find, a, wo...
10464    [, Guinness, World, Record, of, lbs, of, -laye...
Name: 4, Length: 10465, dtype: object

In [72]:
limpi.to_csv('ejemplo\\limpi.csv', index= False)