In [1]:
import pandas as pd

In [2]:
reviews = pd.read_csv('Group1_Influenster.csv')

In [3]:
reviews = reviews.text.tolist()

In [4]:
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords 
import nltk
nltk.download("stopwords")
from nltk.stem import PorterStemmer
from collections import Counter

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mathildeduverger/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


CREATE A FUNCTION

In [5]:
def clean_stem_token(text):
    # output
    clean_stem_token = []
    
    # delete all punctuation from text and upper cases
    text = text.lower()
    transfo = {a:" " for a in ["@","/",".","#",",","'","-","!","?"]}
    text = text.translate(str.maketrans(transfo))
    
    # tokenize text
    tkzer = TweetTokenizer(preserve_case=False,strip_handles=True)
    tokens = tkzer.tokenize(text)
    
    #delete stopwords
    stop_words = set(stopwords.words("english"))
    tokens = [token for token in tokens if token.lower() not in stop_words]
    
    #stem tokens
    ps = PorterStemmer()
    for w in tokens:
        clean_stem_token.append(ps.stem(w))
        
    return(clean_stem_token)
    

In [6]:
reviews_tok = [clean_stem_token(t) for t in reviews]

In [7]:
reviews_tok

[['realli',
  'upset',
  'edit',
  'iphon',
  'estheticli',
  'beauti',
  'like',
  'previou',
  'edit',
  'iphon',
  'eleg',
  'especi',
  'posit',
  'camera',
  'secondli',
  'fragil',
  'get',
  'crush',
  'easili',
  'strong',
  'point',
  'camera',
  'qualiti',
  'amaz',
  'processor',
  'develop'],
 ['el',
  'precio',
  'es',
  'muy',
  'alto',
  'en',
  'comparación',
  'su',
  'rival',
  'quien',
  'logró',
  'destronarlo',
  'en',
  'meno',
  'de',
  'un',
  'me',
  'con',
  'la',
  'posición',
  'de',
  'mejor',
  'camara',
  'con',
  'un',
  'costo',
  'casi',
  'al',
  'dobl',
  'del',
  'googl',
  'pixel',
  'el',
  'nuevo',
  'modelo',
  'xl',
  '2',
  'es',
  'quien',
  'llegó',
  'para',
  'derribar',
  'est',
  'gigant',
  'de',
  'la',
  'comunicación',
  'celular',
  'inclus',
  'la',
  'portabilidad',
  'del',
  'pixel',
  'es',
  'mejor'],
 ['’',
  'new',
  'iphon',
  'x',
  'around',
  'month',
  'overal',
  '’',
  'pleas',
  'expect',
  'surround',
  'phone',
  '

FUNCTIONS FOR TF IDF


In [8]:
import math

In [27]:
def computeTF(wordDict,bow):
    tfDict = {}
    n = len(bow)
    tfDict ={k: v / n for k, v in wordDict.items()}
    
    return tfDict

In [28]:
def computeIDF(DictList):
    idfDict = {}
    N = len(DictList)
    
    idfDict = dict.fromkeys(DictList[0].keys(),0)
    for doc in DictList:
        for word, val in doc.items():
            if val>0 :
                idfDict[word] += 1
    for word, val in idfDict.items():
        idfDict[word] = math.log(N/float(val))
    
    return idfDict

In [29]:
def computeTFIDF(bowList):
    
    # create a set with all the words that appear in every bag of words
    wordSet = set()
    for bow in bowList :
        # we use | to apply union to the sets
        wordSet |= set(bow)
    
    # create a dictionnary from the above set, initalising values at 0
    wordDic = dict.fromkeys(wordSet,0)
    
    # DictList is the list of the complete dictionnaries above for every bag of words
    DictList = []
    
    # tfDicts is the list of all dictionnaries for every bag of word where each value is the number
    # of occurences of the word in the bag of words divided by the number of words in the
    # bag of words
    tfDicts = []
    
    for bow in bowList :
        
        bowDic = wordDic.copy()
        # Counter counts the occurences of each word in the bag of words
        bowDic.update(dict(Counter(bow)))
        
        tfDict = computeTF(bowDic,bow)
        
        tfDicts.append(tfDict)
        DictList.append(bowDic)
    
    # idfDict is the dictionnary of all words in every bag of words where the value of 
    # a word is equal to the log(n the number of bow/ the number of documents in which the 
    # word appears)
    idfDict = computeIDF(DictList)
        
    result = []
    for subtf in tfDicts:
        tfidf = {}
        tfidf = {k: v*idfDict[k] for k, v in subtf.items()}
        result.append(tfidf)
    
    return result
    

TEST

In [41]:
text = ["This is the first document","This is the second document"]

In [42]:
test = [clean_stem_token(t) for t in text]

In [43]:
test

[['first', 'document'], ['second', 'document']]

In [44]:
TFIDF = computeTFIDF(test)

In [45]:
TFIDF

[{'document': 0.0, 'first': 0.34657359027997264, 'second': 0.0},
 {'document': 0.0, 'first': 0.0, 'second': 0.34657359027997264}]