# In [None]:  (notebook cell marker left over from the export)
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found below the Kaggle input
# directory, then print the paths one per line.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session


# import what we need

from nltk.tokenize import word_tokenize
from sklearn import linear_model
from sklearn import metrics
from sklearn import model_selection
from sklearn.feature_extraction.text import CountVectorizer

if __name__ == '__main__':
    # Sentiment classification on the IMDB reviews CSV: bag-of-words counts
    # (CountVectorizer with NLTK's word_tokenize) fed to a logistic
    # regression, evaluated with stratified k-fold cross-validation.
    # Prints the accuracy obtained on each held-out fold.

    # number of cross-validation folds; single source of truth for both
    # building the folds and iterating over them
    n_folds = 5

    # fixed seed so the shuffle below -- and therefore fold membership and
    # the reported accuracies -- is repeatable from run to run
    seed = 42

    # read the training data
    df = pd.read_csv('/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv')

    # map 'positive' to 1 and every other label to 0 (vectorized comparison)
    df['sentiment'] = (df['sentiment'] == 'positive').astype(int)

    # create a new column called kfold and fill it with -1 ("not assigned")
    df['kfold'] = -1

    # the next step is to randomize the rows of the data; the index is reset
    # so that the positional indices produced by split() below coincide with
    # the index labels used by df.loc
    df = df.sample(frac=1, random_state=seed).reset_index(drop=True)

    # fetch labels
    y = df.sentiment.values

    # initiate the kfold class from model_selection module
    kf = model_selection.StratifiedKFold(n_splits=n_folds)

    # fill the kfold column: each row gets the number of the fold in which
    # it is part of the validation split (the training indices are unused)
    for fold, (_, valid_idx) in enumerate(kf.split(X=df, y=y)):
        df.loc[valid_idx, 'kfold'] = fold

    # go over the folds created
    for fold in range(n_folds):
        # temporary dataframes: train on every other fold, test on this one
        train_df = df[df.kfold != fold].reset_index(drop=True)
        test_df = df[df.kfold == fold].reset_index(drop=True)

        # initialize CountVectorizer with NLTK's word_tokenize function as
        # tokenizer; token_pattern is set to None because a custom tokenizer
        # is supplied instead of the regex pattern
        count_vec = CountVectorizer(
            tokenizer=word_tokenize,
            token_pattern=None,
        )

        # learn the vocabulary on the training reviews only and vectorize
        # them in the same pass, then vectorize the held-out reviews with
        # that same vocabulary
        xtrain = count_vec.fit_transform(train_df.review)
        xtest = count_vec.transform(test_df.review)

        # initialize the logistic regression model (library defaults)
        model = linear_model.LogisticRegression()

        # fit the model on training data reviews and sentiment
        model.fit(xtrain, train_df.sentiment)

        # predict hard class labels (0/1) for the held-out reviews
        preds = model.predict(xtest)

        # calculate accuracy
        accuracy = metrics.accuracy_score(test_df.sentiment, preds)

        print(f'Fold: {fold}')
        print(f'Accuracy: {accuracy}')
        print(' ')
    
    
    
    