# NIRS (model)

In [10]:
import utils
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline
utils.seed_everything(42)

In [11]:
# Execute the companion preprocessing notebook from this kernel. With IPython's
# %run, the names defined by the executed file are loaded into the interactive
# namespace once it finishes.
# NOTE(review): presumably this is where `unidecode`, `TextBlob` and
# `word_tokenize` (used by TextPreprocessor below) are imported — confirm.
%run data-with-text-preprocessing.ipynb

[nltk_data] Downloading package punkt to /home/lucamodica/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/lucamodica/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package words to /home/lucamodica/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/lucamodica/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/lucamodica/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


## Read, preprocess and represent data (to test)

In [12]:
# Load each frame from the CSV that matches its name. The two paths used to be
# crossed (df_reviews read the products file and df_products the reviews file).
df_reviews = pd.read_csv('data/reviews_sampled.csv')
df_products = pd.read_csv('data/products_sampled.csv')

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
import string, re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import spacy
import nltk

# DOWNLOADING THE NLTK RESOURCES USED BY THIS NOTEBOOK
# (nltk reports "already up-to-date" and skips the fetch when they are cached)
for nltk_resource in (
    'punkt',
    'wordnet',
    'words',
    'stopwords',
    'averaged_perceptron_tagger',
):
    nltk.download(nltk_resource)

# LOADING SPACY ENGLISH LANGUAGE MODEL
# Loaded exactly once: the model used to be loaded twice in this cell, which
# only repeated the same work and rebound the same name.
nlp = spacy.load("en_core_web_sm")

# LOADING WORD LIST FROM NLTK
# Stored as a set so membership checks do not scan the whole vocabulary.
words = set(nltk.corpus.words.words())

ModuleNotFoundError: No module named 'spacy'

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

# Lowercase keywords / phrases that mark a piece of text as COVID-vaccine
# related. Every entry appears exactly once; order follows first occurrence.
covid_vaccine_keywords = [
    # Correct spellings
    "vaccine", "vaccination", "covid", "covid-19", "coronavirus", "sars-cov-2",
    "pfizer", "moderna", "astrazeneca", "johnson & johnson", "j&j", "sinovac",
    "sputnik", "novavax", "vax", "jab", "immunization", "dose", "booster",
    "mrna", "side effects", "efficacy", "immunity", "herd immunity", "antibody",
    "antibodies", "shot", "inoculation", "clinical trial", "rollout", "pharma",
    "pharmaceutical", "vaxx", "antivax", "antivaxxer", "vaxxed", "unvaccinated",
    "fully vaccinated", "vaccine passport", "vaccine mandate", "immunocompromised",

    # Common misspellings and social media abbreviations/slang
    # (terms already present in the section above are not repeated here)
    "vaxine", "vaccin", "covid19", "corona virus", "cov19", "cov-19",
    "pfzer", "modrna", "astra zeneca", "astrazneca", "jnj", "j & j", "sinopharm",
    "sputnic", "novax", "vac", "vaxination", "imunization", "boster",
    "mrna vaccine", "side efects", "eficacy", "imunity", "herd imunity",
    "antybodies", "shott", "clincal trial", "rolout", "farmaceutical",
    "pharmaceutic", "antivaxer", "unvaxed", "nonvaxxed",
    "vaccine card", "vaccine pass", "mandate vax", "immunocompromise",
]

class TextPreprocessor(BaseEstimator, TransformerMixin):
    """scikit-learn compatible transformer that normalises raw text documents.

    Each document is lowercased, transliterated with ``unidecode``,
    spell-corrected with ``TextBlob``, stripped of digits and punctuation,
    tokenized, filtered against the NLTK English stop-word list and
    lemmatized. The surviving tokens are joined into one space-separated
    string. ``fit`` learns nothing, so the transformer can be used directly
    or as a pipeline step.

    NOTE(review): ``unidecode``, ``TextBlob`` and ``word_tokenize`` are not
    imported in the cells visible here; presumably they are brought in by the
    ``%run`` of the preprocessing notebook — confirm.
    """

    # Patterns compiled once for the class rather than rebuilt per document.
    _DIGITS_RE = re.compile(r'\d+')
    _PUNCTUATION_RE = re.compile('[%s]' % re.escape(string.punctuation))
    _SPACE_RUN_RE = re.compile(' +')

    def __init__(self):
        # Set for O(1) stop-word membership checks during filtering.
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()

    def fit(self, X, y=None):
        """Do nothing and return ``self`` (present for the estimator API)."""
        return self

    def transform(self, X):
        """Return a list with the cleaned version of every document in ``X``.

        ``X`` is any iterable of documents; missing entries (``None``/NaN)
        become empty strings instead of raising.
        """
        return [self._preprocess(text) for text in X]

    def _preprocess(self, text):
        """Clean a single document and return it as a space-joined string."""
        if not isinstance(text, str):
            # Empty CSV cells reach this point as None/NaN; treat them as
            # empty documents instead of failing on ``.lower()`` below.
            if text is None or pd.isna(text):
                return ''
            text = str(text)

        # Lowercasing
        text = text.lower()
        # Remove accented characters
        text = unidecode(text)

        # NER Tagging (disabled)
        # NOTE(review): as written, enabling this replaces the whole text with
        # only its named entities; every non-entity word would be dropped.
        # doc = nlp(text)
        # entities = []
        # for ent in doc.ents:
        #     if any(keyword.lower() in ent.text.lower() for keyword in covid_vaccine_keywords):
        #         entities.append("[COVID_VACCINE]")
        #     else:
        #         entities.append(ent.text)
        # text = " ".join(entities)

        # Correct typos
        text = str(TextBlob(text).correct())

        # Remove numbers
        text = self._DIGITS_RE.sub('', text)

        # Remove punctuation (deleted outright, so "covid-19" ends up as "covid")
        text = self._PUNCTUATION_RE.sub('', text)

        # Collapse runs of spaces left behind by the removals above
        text = self._SPACE_RUN_RE.sub(' ', text)

        # Tokenize text; named `tokens` so the module-level `words` vocabulary
        # set is not shadowed inside this method.
        tokens = word_tokenize(text)
        # Remove stopwords and lemmatize (no POS tag is passed, so the
        # lemmatizer's default part of speech applies)
        tokens = [
            self.lemmatizer.lemmatize(token)
            for token in tokens
            if token not in self.stop_words
        ]

        return ' '.join(tokens)
