<a href="https://colab.research.google.com/github/oxidiovega/ML_DL_templates/blob/main/NLP__TEMPLATE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer 
from nltk.stem import SnowballStemmer
from nltk.stem.snowball import EnglishStemmer
from nltk.stem.porter import *
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from sklearn.metrics.pairwise import linear_kernel
import warnings



In [None]:
# Load the dataset and discard rows with missing values in one chained expression.
df = pd.read_csv("").dropna()  # insert the path to your csv file
df.head()  # preview the first 5 rows

In [None]:
# Prepare the inputs for a word cloud that visualizes the text.
text_column = " "   # the name of the column that contains your text data
label_column = " "  # the name of your labels column

# A set gives faster membership tests than a list.
# NOTE: this rebinds `stopwords`, shadowing the `nltk.corpus.stopwords` import made
# at the top of the notebook; the cells below rely on this set, not on the NLTK corpus.
stopwords = set(STOPWORDS)

# Join every document into one string. The column is selected through `text_column`
# (instead of a second hard-coded name) so it only has to be configured once, and
# values are coerced to str so a non-string cell cannot break the join.
text = " ".join(df[text_column].astype(str))



In [None]:

# Build the word cloud from the joined text, then draw it without axis ticks.
wordcloud = WordCloud(
    background_color="white",
    stopwords=stopwords,
).generate(text)

plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

In [None]:
# Add the extra common words you spotted in the visualization (in-place set union).
stopwords |= {" ", " "}

In [None]:
# word_tokenize needs the Punkt tokenizer data, which is not downloaded anywhere else
# in this notebook; fetch it here so the cell also works on a fresh kernel.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)  # resource name looked up by newer NLTK releases

# Lower-case every document so that matching is case-insensitive.
df[text_column] = df[text_column].str.lower()

# Split each document into a list of word tokens.
df[text_column] = df[text_column].apply(word_tokenize)

In [None]:
# Single pass over each token list: keep only purely alphabetic tokens (this drops
# punctuation, digits and markup fragments) that are not in the stopword set.
df[text_column] = df[text_column].apply(
    lambda tokens: [tok for tok in tokens if tok.isalpha() and tok not in stopwords]
)

In [None]:
# Normalizers used by the next cells: a WordNet lemmatizer and an English Snowball stemmer.
lemmatizer = WordNetLemmatizer()
snow_stemmer = SnowballStemmer("english")

# The lemmatizer reads the WordNet corpus, so make sure it is available locally.
nltk.download("wordnet")

In [None]:
# Lemmatization version: map every token of every document to its lemma.
df[text_column] = df[text_column].apply(
    lambda tokens: list(map(lemmatizer.lemmatize, tokens))
)

In [None]:
# Stemming version: reduce every token of every document to its Snowball stem.
df[text_column] = df[text_column].apply(
    lambda tokens: list(map(snow_stemmer.stem, tokens))
)

In [None]:
# Join each token list back into a single space-separated string so that TF-IDF can
# be applied. (The source frame is `df`; the previous `ds` was an undefined name.)
df[text_column] = df[text_column].apply(" ".join)

In [None]:
# Checkpoint the cleaned dataset to disk so the work done so far is not lost.
df.to_csv(
    "dataset_clean.csv",
    index=False,
    encoding="utf-8",
)

In [None]:
# Hold out 30% of the rows for testing. The seed is fixed so that the split, and
# therefore every score reported below, is the same on each run of the notebook.
X_train, X_test, y_train, y_test = train_test_split(
    df[text_column],
    df[label_column],
    test_size=0.3,
    random_state=42,
)


In [None]:
# Use a grid search to find the best TF-IDF parameters for our model.
from sklearn.model_selection import GridSearchCV

pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LinearSVC()),
])

# Keys follow the "<step name>__<parameter>" convention; you can add more parameters here.
parameters = {
    'tfidf__max_df': (0.25, 0.35, 0.5, 0.75),
    'tfidf__min_df': [5, 10, 15, 20, 25, 50],
}

grid_search_tune = GridSearchCV(pipeline, parameters, cv=2, n_jobs=2, verbose=3)
grid_search_tune.fit(X_train, y_train)

# Report the winning parameter combination and its mean cross-validated score
# directly, rather than the repr of the fitted pipeline steps: these are the
# values needed to configure the final vectorizer.
print("Best parameters set:")
print(grid_search_tune.best_params_)
print("Best cross-validation score:")
print(grid_search_tune.best_score_)

In [None]:
# Create the TF-IDF vectorizer with the parameters selected by the grid search.
# (The previous `min_df=,max_df=` placeholders were a syntax error; replace the
# look-ups below with literal values if you prefer to set them by hand.)
tf = TfidfVectorizer(
    min_df=grid_search_tune.best_params_['tfidf__min_df'],
    max_df=grid_search_tune.best_params_['tfidf__max_df'],
)

In [None]:
# Final model: TF-IDF features fed into a linear SVM.
# LinearSVC can be swapped for KNN or any other classifier.
pipeline_tf = Pipeline(
    steps=[
        ("tf-idf", tf),
        ("clf", LinearSVC()),
    ]
)

# Train on the training split, then predict the held-out test split.
model_tf = pipeline_tf.fit(X_train, y_train)
predicted_tf = model_tf.predict(X_test)

# Per-class precision / recall / F1 on the test split.
report_tf = classification_report(y_test, predicted_tf)
print(report_tf)


Next, we experiment with cosine similarity to find the sentences in the dataset that are most similar to a new input sentence.

In [None]:
# We use this function to preprocess a new sentence the same way as the dataset.
def preprocess_phrase(phrase, stem=True):
    """Clean a raw sentence with the same steps that were applied to the dataset.

    The sentence is lower-cased, tokenized with ``word_tokenize`` (the tokenizer
    used on the dataset, so trailing punctuation is split off instead of causing
    the whole word to be dropped), filtered down to alphabetic non-stopword
    tokens, lemmatized and, optionally, stemmed.

    Parameters
    ----------
    phrase : str
        The raw input sentence.
    stem : bool, default True
        Also apply the Snowball stemmer after lemmatizing, matching a dataset on
        which both the lemmatization and the stemming cells were run. Pass
        ``False`` if you only ran the lemmatization cell.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces (possibly an empty string).
    """
    cleaned = []
    for token in word_tokenize(phrase.lower()):
        # Drop punctuation, numbers and stopwords.
        if token.isalpha() and token not in stopwords:
            token = lemmatizer.lemmatize(token)
            if stem:
                token = snow_stemmer.stem(token)
            cleaned.append(token)
    return " ".join(cleaned)

preprocess_phrase(" try typing stuff here /*/+.")


In [None]:
def get_similaire_tf(phrase, top_n=5):
    """Return the dataset sentences most similar to ``phrase``.

    The idea: preprocess the input, append it to a copy of the corpus, recompute
    TF-IDF over corpus + input, then rank every corpus document by its similarity
    to the input and keep the best ``top_n``.

    Parameters
    ----------
    phrase : str
        The raw input sentence.
    top_n : int, default 5
        Number of similar sentences to return.

    Returns
    -------
    list of str
        The preprocessed dataset sentences, most similar first.
    """
    # Actually use the cleaned phrase (the result used to be discarded).
    cleaned = preprocess_phrase(phrase)

    # Work on a plain list so `df` is never modified and look-ups are positional,
    # which stays correct even when the frame's index has gaps after dropna().
    corpus = df[text_column].astype(str).tolist()

    # Fit a fresh vectorizer configured like `tf`. Refitting `tf` itself would
    # also change the vectorizer inside the already-trained pipeline.
    vectorizer = TfidfVectorizer(**tf.get_params())
    X = vectorizer.fit_transform(corpus + [cleaned])

    # Compare the input (last row) with the corpus rows only, so the input is not
    # returned as its own best match. With L2-normalized TF-IDF rows (the
    # vectorizer's default norm), this dot product is the cosine similarity.
    cosine_similarities = linear_kernel(X[-1], X[:-1]).ravel()

    # Indices of the `top_n` highest similarities, best first.
    related_docs_indices = cosine_similarities.argsort()[::-1][:top_n]
    return [corpus[i] for i in related_docs_indices]





In [None]:
# Try it out: replace the placeholder text with your own sentence.
get_similaire_tf(phrase=" insert your new sentence here")