In [1]:
from datasets import load_dataset
import pandas as pd

In [2]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split 
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier

In [3]:
from datasets import get_dataset_config_names

configs = get_dataset_config_names("imdb")
print(configs)

['plain_text']


In [4]:
datasets = load_dataset('imdb','plain_text')

Found cached dataset imdb (/home/omranian/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0)


  0%|          | 0/3 [00:00<?, ?it/s]

In [5]:
datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [6]:
train = datasets['train'].to_pandas()
train

Unnamed: 0,text,label
0,I rented I AM CURIOUS-YELLOW from my video sto...,0
1,"""I Am Curious: Yellow"" is a risible and preten...",0
2,If only to avoid making this type of film in t...,0
3,This film was probably inspired by Godard's Ma...,0
4,"Oh, brother...after hearing about this ridicul...",0
...,...,...
24995,A hit at the time but now better categorised a...,1
24996,I love this movie like no other. Another time ...,1
24997,This film and it's sequel Barry Mckenzie holds...,1
24998,'The Adventures Of Barry McKenzie' started lif...,1


In [7]:
test = datasets['test'].to_pandas()
test

Unnamed: 0,text,label
0,I love sci-fi and am willing to put up with a ...,0
1,"Worth the entertainment value of a rental, esp...",0
2,its a totally average film with a few semi-alr...,0
3,STAR RATING: ***** Saturday Night **** Friday ...,0
4,"First off let me say, If you haven't enjoyed a...",0
...,...,...
24995,Just got around to seeing Monster Man yesterda...,1
24996,I got this as part of a competition prize. I w...,1
24997,I got Monster Man in a box set of three films ...,1
24998,"Five minutes in, i started to feel how naff th...",1


In [8]:
train.text

0        I rented I AM CURIOUS-YELLOW from my video sto...
1        "I Am Curious: Yellow" is a risible and preten...
2        If only to avoid making this type of film in t...
3        This film was probably inspired by Godard's Ma...
4        Oh, brother...after hearing about this ridicul...
                               ...                        
24995    A hit at the time but now better categorised a...
24996    I love this movie like no other. Another time ...
24997    This film and it's sequel Barry Mckenzie holds...
24998    'The Adventures Of Barry McKenzie' started lif...
24999    The story centers around Barry McKenzie who mu...
Name: text, Length: 25000, dtype: object

In [9]:
test.text[0]

'I love sci-fi and am willing to put up with a lot. Sci-fi movies/TV are usually underfunded, under-appreciated and misunderstood. I tried to like this, I really did, but it is to good TV sci-fi as Babylon 5 is to Star Trek (the original). Silly prosthetics, cheap cardboard sets, stilted dialogues, CG that doesn\'t match the background, and painfully one-dimensional characters cannot be overcome with a \'sci-fi\' setting. (I\'m sure there are those of you out there who think Babylon 5 is good sci-fi TV. It\'s not. It\'s clichéd and uninspiring.) While US viewers might like emotion and character development, sci-fi is a genre that does not take itself seriously (cf. Star Trek). It may treat important issues, yet not as a serious philosophy. It\'s really difficult to care about the characters here as they are not simply foolish, just missing a spark of life. Their actions and reactions are wooden and predictable, often painful to watch. The makers of Earth KNOW it\'s rubbish as they have

In [10]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/omranian/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [11]:
# function to clean data
import string
import itertools 
import re
from nltk.stem import WordNetLemmatizer
from string import punctuation
from nltk.stem import PorterStemmer

stops = ['the','a','an','and','but','if','or','because','as','what','which','this','that','these','those','then',
              'just','so','than','such','both','through','about','for','is','of','while','during','to','What','Which',
              'Is','If','While','This']

def cleanData(text, lowercase = False, remove_stops = False, stemming = False, lemmatization = False):
    txt = str(text)
    
    # Replace apostrophes with standard lexicons
    txt = txt.replace("isn't", "is not")
    txt = txt.replace("aren't", "are not")
    txt = txt.replace("ain't", "am not")
    txt = txt.replace("won't", "will not")
    txt = txt.replace("didn't", "did not")
    txt = txt.replace("shan't", "shall not")
    txt = txt.replace("haven't", "have not")
    txt = txt.replace("hadn't", "had not")
    txt = txt.replace("hasn't", "has not")
    txt = txt.replace("don't", "do not")
    txt = txt.replace("wasn't", "was not")
    txt = txt.replace("weren't", "were not")
    txt = txt.replace("doesn't", "does not")
    txt = txt.replace("'s", " is")
    txt = txt.replace("'re", " are")
    txt = txt.replace("'m", " am")
    txt = txt.replace("'d", " would")
    txt = txt.replace("'ll", " will")
    
    # More cleaning
    txt = re.sub(r"\'s", " ", txt)
    txt = re.sub(r"\'ve", " have ", txt)
    txt = re.sub(r"can't", "cannot ", txt)
    txt = re.sub(r"n't", " not ", txt)
    txt = re.sub(r"I'm", "I am", txt)
    txt = re.sub(r" m ", " am ", txt)
    txt = re.sub(r"\'re", " are ", txt)
    txt = re.sub(r"\'d", " would ", txt)
    txt = re.sub(r"\'ll", " will ", txt)
    txt = re.sub(r" e g ", " eg ", txt)
    txt = re.sub(r"\0s", "0", txt)

    # Remove urls and emails
    txt = re.sub(r'^https?:\/\/.[\r\n]', ' ', txt, flags=re.MULTILINE)
    txt = re.sub(r'[\w\.-]+@[\w\.-]+', ' ', txt, flags=re.MULTILINE)
    
    # Remove punctuation from text
    txt = ''.join([c for c in text if c not in punctuation])

    
    # Remove all symbols
    txt = re.sub(r'[^A-Za-z0-9\s]',r' ',txt)
    txt = re.sub(r'\n',r' ',txt)
    
    txt = re.sub(r'[0-9]',r' ',txt)
    
    # Replace words like sooooooo with so
    txt = ''.join(''.join(s)[:2] for _, s in itertools.groupby(txt))
    
    # lowercase and Split (instead of tokenization, so no need to word tokenization later:-)  attached words   
    if lowercase:
        txt = " ".join([w.lower() for w in txt.split()])
        
    if remove_stops:
        txt = " ".join([w for w in txt.split() if w not in stops])
    if stemming:
        st = PorterStemmer()
#         print (len(txt.split()))
#         print (txt)
        txt = " ".join([st.stem(w) for w in txt.split()])
    
    if lemmatization:
        wordnet_lemmatizer = WordNetLemmatizer()
        txt = " ".join([wordnet_lemmatizer.lemmatize(w, pos='v') for w in txt.split()])

    return txt

In [12]:
train.text = train['text'].map(lambda x: cleanData(x, lowercase=True, remove_stops=True, stemming=True, lemmatization = True))
train.text

0        i rent i be curiousyellow from my video store ...
1        i be curiou yellow risibl pretenti steam pile ...
2        onli avoid make type film in futur film intere...
3        film wa probabl inspir by godard masculin f mi...
4        oh brotheraft hear ridicul film umpteen year a...
                               ...                        
24995    hit at time now better categoris australian cu...
24996    i love movi like no other anoth time i will tr...
24997    film it sequel barri mckenzi hold hi own be tw...
24998    adventur barri mckenzi start life satir comic ...
24999    stori center around barri mckenzi who must go ...
Name: text, Length: 25000, dtype: object

In [13]:
train.text[0]

'i rent i be curiousyellow from my video store all controversi surround it when it wa first releas in i also hear at first it wa seiz by us custom it ever tri enter countri therefor be fan film consid controversi i realli have see myselfbr br plot center around young swedish drama student name lena who want learn everyth she can life in particular she want focu her attent make some sort documentari on averag swede think certain polit issu vietnam war race issu in unit state in between ask politician ordinari denizen stockholm their opinion on polit she ha sex with her drama teacher classmat marri menbr br kill me i be curiousyellow year ago wa consid pornograph realli sex nuditi scene be few far between even it not shoot like some cheapli make porno my countrymen mind find it shock in realiti sex nuditi be major stapl in swedish cinema even ingmar bergman arguabl their answer good old boy john ford have sex scene in hi filmsbr br i do commend filmmak fact ani sex show in film show arti

In [14]:
test.text = test['text'].map(lambda x: cleanData(x, lowercase=True, remove_stops=True, stemming=True, lemmatization = True))
test.text

0        i love scifi be will put up with lot scifi mov...
1        worth entertain valu rental especi you like ac...
2        it total averag film with few semialright acti...
3        star rate saturday night friday night friday m...
4        first off let me say you havent enjoy van damm...
                               ...                        
24995    get around see monster man yesterday it have b...
24996    i get part competit prize i watch it not reall...
24997    i get monster man in box set three film where ...
24998    five minut in i start feel how naff wa look yo...
24999    i catch movi on scifi channel recent it actual...
Name: text, Length: 25000, dtype: object

In [15]:
# Load library
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

In [16]:
# Create the bag of words feature matrix
count = CountVectorizer()
bag_of_words = count.fit_transform(train.text)

bag_of_words

<25000x88562 sparse matrix of type '<class 'numpy.int64'>'
	with 2932741 stored elements in Compressed Sparse Row format>

In [17]:
bag_of_words_test = count.fit_transform(test.text)
test_set = bag_of_words_test

In [18]:
features = bag_of_words
features

<25000x88562 sparse matrix of type '<class 'numpy.int64'>'
	with 2932741 stored elements in Compressed Sparse Row format>

In [19]:
target = train['label']

In [20]:
 # Create random forest classifier object
randomforest = RandomForestClassifier(random_state=0, n_estimators=1000, n_jobs=-1)

In [21]:
# Train model
model = randomforest.fit(features, target)

In [None]:
# training evaluation: Cross-validate model using accuracy
cross_val_score(model, features, target, scoring='accuracy')

In [None]:
# model evaluation: Cross-validate model using accuracy
cross_val_score(model, test_set, target, scoring='accuracy')