# S12 T01: Pipelines, grid search i text mining

## Descripció
Comencem a familiaritzar-nos amb Pipelines, grid search i text mining !!! Comencem amb uns quants exercicis bàsics

## Nivell 1
### - Exercici 1
Agafa el conjunt de dades que vulguis i realitza un pipeline i un gridsearch aplicant l'algorisme de Random Forest.

### - Exercici 2
Agafa un text en anglès que vulguis, i calcula'n la freqüència de les paraules

In [97]:
import pandas as pd
import numpy as np
import re
import nltk
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
from nltk.probability import FreqDist
from nltk.stem.snowball import SnowballStemmer


In [98]:
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

In [99]:
train_df.shape


(891, 12)

In [100]:
train_df.sample(10)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
851,852,0,3,"Svensson, Mr. Johan",male,74.0,0,0,347060,7.775,,S
42,43,0,3,"Kraeff, Mr. Theodor",male,,0,0,349253,7.8958,,C
288,289,1,2,"Hosono, Mr. Masabumi",male,42.0,0,0,237798,13.0,,S
473,474,1,2,"Jerwan, Mrs. Amin S (Marie Marthe Thuillard)",female,23.0,0,0,SC/AH Basle 541,13.7917,D,C
766,767,0,1,"Brewe, Dr. Arthur Jackson",male,,0,0,112379,39.6,,C
528,529,0,3,"Salonen, Mr. Johan Werner",male,39.0,0,0,3101296,7.925,,S
78,79,1,2,"Caldwell, Master. Alden Gates",male,0.83,0,2,248738,29.0,,S
194,195,1,1,"Brown, Mrs. James Joseph (Margaret Tobin)",female,44.0,0,0,PC 17610,27.7208,B4,C
83,84,0,1,"Carrau, Mr. Francisco M",male,28.0,0,0,113059,47.1,,S
699,700,0,3,"Humblen, Mr. Adolf Mathias Nicolai Olsen",male,42.0,0,0,348121,7.65,F G63,S


In [101]:
y_train = train_df.iloc[:, 1]
X_train = train_df.iloc[:, 2:].drop(['Name', 'Cabin', 'Ticket'], axis=1)
y_test = test_df.iloc[:, 1]
X_test = test_df.iloc[:, 1:].drop(['Name', 'Cabin', 'Ticket'], axis=1)


In [102]:
X_train = pd.get_dummies(X_train, drop_first=True)
X_test = pd.get_dummies(X_test, drop_first=True)


In [103]:
pipe = Pipeline([
    ('SImp', SimpleImputer(strategy='median')),
    ('RandForestC', RandomForestClassifier())
])


In [104]:
params = {
    'RandForestC__max_depth': [6, 7, 8, 9],
    'RandForestC__n_estimators': [50,  60,  70,  80,  90, 100],
    'RandForestC__min_samples_split': [2, 5, 10],
    'RandForestC__bootstrap': [True, False]
}


In [105]:
cv_model = GridSearchCV(
    cv=5,
    estimator=pipe,
    param_grid=params
)

In [106]:
cv_model.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('SImp',
                                        SimpleImputer(strategy='median')),
                                       ('RandForestC',
                                        RandomForestClassifier())]),
             param_grid={'RandForestC__bootstrap': [True, False],
                         'RandForestC__max_depth': [6, 7, 8, 9],
                         'RandForestC__min_samples_split': [2, 5, 10],
                         'RandForestC__n_estimators': [50, 60, 70, 80, 90,
                                                       100]})

In [107]:
cv_model.best_params_

{'RandForestC__bootstrap': True,
 'RandForestC__max_depth': 9,
 'RandForestC__min_samples_split': 2,
 'RandForestC__n_estimators': 80}

In [108]:
predictions = cv_model.predict(X_test)
errors = abs(predictions - y_test)
mape = 100 * np.mean(errors / y_test)
accuracy = 100 - mape
print('Model Performance')
print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
print('Accuracy = {:0.2f}%.'.format(accuracy))

Model Performance
Average Error: 1.9378 degrees.
Accuracy = 21.73%.



### - Exercici 2
Agafa un text en anglès que vulguis, i calcula'n la freqüència de les paraules

In [109]:
from urllib import request
url = "http://www.gutenberg.org/files/2554/2554-0.txt"
response = request.urlopen(url)
text = response.read().decode('utf8')
text = re.sub("[^-9A-Za-z ]", "" , text)

In [110]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/marcr/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [111]:
text_token=nltk.tokenize.word_tokenize(text)

In [112]:
fdist = FreqDist(text_token)
fdist

FreqDist({'the': 6400, 'and': 5426, 'to': 4690, 'a': 3946, 'of': 3366, 'I': 3301, 'he': 3108, 'you': 2930, 'in': 2749, 'was': 2440, ...})


## Nivell 2
### - Exercici 1
Treu les stopwords i realitza stemming al teu conjunt de dades.

In [113]:

nltk.download('stopwords')
from nltk.corpus import stopwords
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Downloading package stopwords to /home/marcr/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [114]:
stop_words=stopwords.words('english')
text_filtered = [w for w in text_token if not w.lower() in stop_words]
text_filtered[:10]

['Project',
 'Gutenberg',
 'eBook',
 'Crime',
 'Punishment',
 'Fyodor',
 'DostoevskyThis',
 'eBook',
 'use',
 'anyone']

In [115]:
stemmer = SnowballStemmer("english", ignore_stopwords=True)
text_stem = [stemmer.stem(word) for word in text_token]
text_stem[1::100]

['project',
 'englishcharact',
 'petersburg',
 'the',
 'you',
 'case',
 '9',
 'his',
 'more',
 'a',
 'inhimself',
 'he',
 'or',
 'pain',
 'not',
 'charact',
 'somewher',
 'this',
 'mani',
 'of',
 'the',
 'porterswho',
 'bell',
 'him',
 'i',
 'of',
 'a',
 'thought',
 'the',
 'was',
 'the',
 'say',
 'say',
 'in',
 'repuls',
 'men',
 'is',
 'the',
 'sleep',
 'apart',
 'might',
 'thecount',
 'because',
 'there',
 'brush',
 'titularcounsellor',
 'and',
 'a',
 'you',
 'general',
 'to',
 'you',
 'where',
 'on',
 'began',
 'magnanim',
 'the',
 'children',
 'that',
 'of',
 'be',
 'paid',
 'she',
 'dont',
 'do',
 'is',
 'read',
 'has',
 'thelittl',
 'dont',
 'not',
 'both',
 'our',
 'give',
 'man',
 'servic',
 'of',
 'to',
 'visit',
 'degre',
 'weve',
 'littl',
 'marmeladov',
 'restor',
 'and',
 'to',
 'for',
 'me',
 'to',
 'and',
 'i',
 'oh',
 'in',
 'two',
 'they',
 'it',
 'full',
 'tallslim',
 'come',
 'tall',
 'stranger',
 'themoney',
 'strike',
 'with',
 'was',
 'have',
 'and',
 'bilious',


## Nivell 3
### - Exercici 1
Realitza sentiment analysis al teu conjunt de dades.

In [118]:
from nltk.sentiment import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')

sia = SentimentIntensityAnalyzer()
sia.polarity_scores(text)

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/marcr/nltk_data...


{'neg': 0.11, 'neu': 0.789, 'pos': 0.101, 'compound': -1.0}

## Recursos
Recursos de l'aula i https://www.nltk.org

## Objectius
Utilitzar pipelines i grid search
Realitzar mineria de texts