In [17]:
import string 
import re 
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')

from sklearn.feature_extraction.text import TfidfVectorizer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gmjsl\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Defining function to clean the text
def clean_text(text):
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

In [8]:
# tokenize
tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
def tokenize(text):
    text = tokenizer.tokenize(text)
    return text

In [9]:
# Remove stopwords
def remove_stopwords(text):
    words = [word for word in text if word not in stopwords.words('english')]
    return words

In [10]:
# Lemmatize
lemmatizer = WordNetLemmatizer()
def lemmatize(text):
    text = [lemmatizer.lemmatize(token) for token in text]
    return text

In [11]:
# text preprocessing functions 
def text_preprocessing(text):
    # Clean up text
    nopunc = clean_text(text)
    
    # Tokenize
    tokenized_text = tokenize(nopunc)
    
    # Remove stopwords
    removed_stopwords_text = remove_stopwords(tokenized_text)
    
    # Lemmatize
    lemmatized_text = lemmatize(removed_stopwords_text)
    
    combined_text = ' '.join(lemmatized_text)
    
    return combined_text

In [18]:
def get_tfidf_vectors(essays):
    vectorizer = TfidfVectorizer(ngram_range=(1, 3), max_df=0.9)
    
    tfidf_vectors = vectorizer.fit_transform(essays)
    
    print(tfidf_vectors)
    
    return pd.DataFrame(tfidf_vectors.toarray(), columns=vectorizer.get_feature_names())

In [23]:
import pandas as pd

In [24]:
train_set  = pd.read_csv('data/train.tsv', sep='\t', encoding = "ISO-8859-1")\
             .rename(columns={'Score1': 'EssayScore'})
train_set = train_set[['Id', 'EssaySet', 'EssayText', 'EssayScore']]

In [26]:
train_set_sample = train_set.head(3)

In [27]:
train_set_sample['EssayText'] = train_set_sample['EssayText'].apply(lambda x: text_preprocessing(x))
train_set_sample

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_set_sample['EssayText'] = train_set_sample['EssayText'].apply(lambda x: text_preprocessing(x))


Unnamed: 0,Id,EssaySet,EssayText,EssayScore
0,1,1,additional information would need replicate ex...,1
1,2,1,reading expirement realized additional informa...,1
2,3,1,need trial control set exact amount vinegar po...,1


In [28]:
get_tfidf_vectors(train_set_sample['EssayText'])

  (0, 131)	0.11176931709391921
  (0, 63)	0.11176931709391921
  (0, 126)	0.11176931709391921
  (0, 162)	0.11176931709391921
  (0, 171)	0.11176931709391921
  (0, 41)	0.11176931709391921
  (0, 91)	0.11176931709391921
  (0, 129)	0.11176931709391921
  (0, 38)	0.11176931709391921
  (0, 61)	0.11176931709391921
  (0, 83)	0.11176931709391921
  (0, 86)	0.11176931709391921
  (0, 160)	0.11176931709391921
  (0, 151)	0.11176931709391921
  (0, 24)	0.11176931709391921
  (0, 67)	0.11176931709391921
  (0, 104)	0.11176931709391921
  (0, 164)	0.11176931709391921
  (0, 93)	0.11176931709391921
  (0, 50)	0.11176931709391921
  (0, 119)	0.11176931709391921
  (0, 95)	0.11176931709391921
  (0, 174)	0.11176931709391921
  (0, 72)	0.11176931709391921
  (0, 4)	0.11176931709391921
  :	:
  (2, 31)	0.14812230607912677
  (2, 34)	0.14812230607912677
  (2, 106)	0.14812230607912677
  (2, 165)	0.14812230607912677
  (2, 12)	0.14812230607912677
  (2, 46)	0.14812230607912677
  (2, 133)	0.14812230607912677
  (2, 28)	0.148122306

Unnamed: 0,accurate,additional,additional information,additional information need,additional information would,also,also take,also take check,amant,amant vinegar,...,would need,would need replicate,write,write conclusion,write conclusion make,yar,yar expirement,yar expirement three,yar result,yar result accurate
0,0.0,0.085003,0.085003,0.0,0.111769,0.0,0.0,0.0,0.0,0.0,...,0.111769,0.111769,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.113079,0.085999,0.085999,0.113079,0.0,0.0,0.0,0.0,0.113079,0.113079,...,0.0,0.0,0.113079,0.113079,0.113079,0.226158,0.113079,0.113079,0.113079,0.113079
2,0.0,0.0,0.0,0.0,0.0,0.148122,0.148122,0.148122,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
