In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [2]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mamoo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mamoo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mamoo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
file_path = 'train.csv'
data = pd.read_csv(file_path)

In [4]:
print(data)

      essay_id                                          full_text  score
0      000d118  Many people have car where they live. The thin...      3
1      000fe60  I am a scientist at NASA that is discussing th...      3
2      001ab80  People always wish they had the same technolog...      4
3      001bdc0  We all heard about Venus, the planet without a...      4
4      002ba53  Dear, State Senator\n\nThis is a letter to arg...      3
...        ...                                                ...    ...
17302  ffd378d  the story " The Challenge of Exploing Venus " ...      2
17303  ffddf1f  Technology has changed a lot of ways that we l...      4
17304  fff016d  If you don't like sitting around all day than ...      2
17305  fffb49b  In "The Challenge of Exporing Venus," the auth...      1
17306  fffed3e  Venus is worthy place to study but dangerous. ...      2

[17307 rows x 3 columns]


In [5]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

In [6]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)

In [7]:
data['processed_text'] = data['full_text'].apply(preprocess_text)

In [8]:
data[['essay_id', 'processed_text', 'score']].head()

Unnamed: 0,essay_id,processed_text,score
0,000d118,many people car live thing dont know use car a...,3
1,000fe60,scientist nasa discussing face mar explaining ...,3
2,001ab80,people always wish technology seen movie best ...,4
3,001bdc0,heard venus planet without almost oxygen earth...,4
4,002ba53,dear state senator letter argue favor keeping ...,3


In [9]:
data= data[['essay_id', 'processed_text', 'score']]

In [10]:
print(data)

      essay_id                                     processed_text  score
0      000d118  many people car live thing dont know use car a...      3
1      000fe60  scientist nasa discussing face mar explaining ...      3
2      001ab80  people always wish technology seen movie best ...      4
3      001bdc0  heard venus planet without almost oxygen earth...      4
4      002ba53  dear state senator letter argue favor keeping ...      3
...        ...                                                ...    ...
17302  ffd378d  story challenge exploing venus informative pie...      2
17303  ffddf1f  technology changed lot way live today nowadays...      4
17304  fff016d  dont like sitting around day great opportunity...      2
17305  fffb49b  challenge exporing venus author suggests study...      1
17306  fffed3e  venus worthy place study dangerous reaosn thei...      2

[17307 rows x 3 columns]


In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(max_features=5000)
tfidf_representation = tfidf_vectorizer.fit_transform(data['processed_text'])
tfidf_representation_dense = tfidf_representation.toarray()
print(tfidf_representation_dense.shape)


(17307, 5000)


In [12]:
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
print(tfidf_feature_names[:150])

['10' '100' '11' '114' '118' '12' '13' '14' '147' '15' '156' '16' '17'
 '170' '18' '1800s' '1888' '19' '1900s' '1940s' '1945' '1947' '1950s'
 '1960' '1960s' '1968' '1976' '1980s' '1990s' '1992' '1995' '1997' '1998'
 '20' '200' '2000' '2001' '2001a' '2005' '2006' '2009' '2012' '2012s'
 '2013' '2015' '2016' '2020' '21' '21st' '22' '22euro' '23' '23rd' '24'
 '247' '25' '25mph' '266' '27' '270' '271' '28' '29' '2nd' '30' '300'
 '301' '31' '335' '34' '35' '3687' '370' '38' '39' '3d' '40' '4000'
 '40000' '41' '41971' '43' '44' '50' '500000' '51' '513' '51998' '533'
 '538' '55' '5500' '5559' '57' '5th' '60' '617' '67' '70' '797' '80' '800'
 '82001' '83' '8th' '90' '97' '98' 'abd' 'ability' 'able' 'aboard'
 'abolish' 'abolished' 'abolishing' 'absolute' 'absolutely' 'absolutly'
 'absurd' 'abundance' 'abundant' 'abuse' 'academic' 'accelerate'
 'accelerating' 'accept' 'accepted' 'access' 'accessible' 'accident'
 'accidently' 'accomplish' 'accomplished' 'accomplishment' 'according'
 'accordingly' 