In [31]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [33]:
import nltk

# Download only the NLTK resources this notebook uses, by name. Calling
# nltk.download() with no arguments opens the interactive downloader, which
# blocks an unattended "Restart & Run All".
#   - punkt / punkt_tab: models behind word_tokenize and sent_tokenize
#   - stopwords: the English stop-word list
#   - averaged_perceptron_tagger(_eng): model behind nltk.pos_tag
# Both the older and the newer resource ids are requested because the id that
# is looked up depends on the installed NLTK version; an id unknown to the
# index is reported by nltk.download() rather than raised.
for resource in (
    'punkt',
    'punkt_tab',
    'stopwords',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(resource)

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [99]:
 pip install -U textblob

Note: you may need to restart the kernel to use updated packages.


In [100]:
pip install textstat

Collecting textstat
  Downloading textstat-0.7.3-py3-none-any.whl (105 kB)
     -------------------------------------- 105.1/105.1 kB 1.5 MB/s eta 0:00:00
Collecting pyphen
  Downloading pyphen-0.13.2-py3-none-any.whl (2.0 MB)
     ---------------------------------------- 2.0/2.0 MB 1.2 MB/s eta 0:00:00
Installing collected packages: pyphen, textstat
Successfully installed pyphen-0.13.2 textstat-0.7.3
Note: you may need to restart the kernel to use updated packages.


In [2]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import pandas as pd
import numpy as np
from textblob import TextBlob
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.corpus import cmudict
import textstat

In [3]:
def preprocess_text(text):
    """Normalise a raw document into a space-separated string of word stems.

    Steps, in order: strip every character that is neither a word character
    nor whitespace, lowercase, tokenise with NLTK, drop English stop words,
    Porter-stem what remains, and join the stems with single spaces.

    Args:
        text: The raw document as a string.

    Returns:
        The normalised document as a single string.
    """
    # Punctuation is removed before lowercasing and tokenising.
    cleaned = re.sub(r'[^\w\s]', '', text).lower()
    words = nltk.word_tokenize(cleaned)

    english_stop_words = set(stopwords.words('english'))
    porter = PorterStemmer()

    # Filter and stem in one pass; stop words are never stemmed.
    stems = [porter.stem(word) for word in words if word not in english_stop_words]
    return " ".join(stems)

# Read the corpus and replace the 'text' column with its preprocessed form
# (see preprocess_text). The raw wording is not kept in the frame.
texts_df = pd.read_csv('texts.csv').assign(
    text=lambda frame: frame['text'].apply(preprocess_text)
)

texts_df

Unnamed: 0,id,text
0,400,young peopl return ballroom present decidedli ...
1,401,dinner time mr fayr somewhat silent eye rest d...
2,402,roger predict snow depart quickli came two day...
3,403,mr grime come next morn sir john harthov place...
4,404,outsid palac great garden wall round fill full...
...,...,...
4719,8027,name monarch mean king adult monarch butterfli...
4720,8028,walk stick long thin slowmov bug look like sti...
4721,8029,black widow shini black spider orang red mark ...
4722,8030,solid shape actual touch three dimens mean len...


In [4]:
def text_length(text):
    """Return how many tokens NLTK's word tokenizer finds in ``text``.

    Args:
        text: The document as a string.

    Returns:
        The token count as an int.
    """
    return len(nltk.word_tokenize(text))

def vocabulary_complexity(text):
    """Return the type-token ratio of ``text``.

    The ratio is the number of distinct tokens divided by the total number of
    tokens, as produced by NLTK's word tokenizer.

    Args:
        text: The document as a string.

    Returns:
        A float in (0, 1] for non-empty text, or 0.0 when the text contains
        no tokens (for example an empty string).
    """
    tokens = nltk.word_tokenize(text)

    # A document with no tokens would otherwise divide by zero.
    if not tokens:
        return 0.0

    return len(set(tokens)) / len(tokens)

def readability_scores(text):
    """Return the TextBlob subjectivity of ``text``.

    Despite its name, this function does not compute a readability metric:
    it reads the ``subjectivity`` field of TextBlob's sentiment analysis.

    Args:
        text: The document as a string.

    Returns:
        The ``subjectivity`` value reported by ``TextBlob(text).sentiment``.
    """
    sentiment = TextBlob(text).sentiment
    return sentiment.subjectivity

def get_pos_frequencies(row, column):
    """Count how often each part-of-speech tag occurs in one row's text.

    Args:
        row: A mapping-like record (e.g. a DataFrame row) holding the text.
        column: The key under which the text is stored in ``row``.

    Returns:
        A dict mapping each tag returned by ``nltk.pos_tag`` to its number of
        occurrences, in order of first appearance.
    """
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(row[column]))

    tag_counts = {}
    for _token, tag in tagged_tokens:
        tag_counts[tag] = tag_counts.get(tag, 0) + 1
    return tag_counts

def flesch_kincaid(text):
    """Return a Flesch readability score for ``text``.

    The score is ``206.835 - 1.015 * (words / sentences)
    - 84.6 * (syllables / words)``. These are the constants of the Flesch
    Reading Ease formula (higher means easier to read), not of the
    Flesch-Kincaid grade level, despite the function's name.

    Words and sentences are counted with NLTK's tokenizers, syllables with
    ``textstat.syllable_count``.

    Args:
        text: The document as a string.

    Returns:
        The score as a float, or NaN when the text has no words or no
        sentences, because the ratios are undefined in that case.
    """
    num_words = len(nltk.word_tokenize(text))
    num_sentences = len(nltk.sent_tokenize(text))

    # Empty input would otherwise raise ZeroDivisionError below.
    if num_words == 0 or num_sentences == 0:
        return float('nan')

    num_syllables = textstat.syllable_count(text)

    words_per_sentence = num_words / num_sentences
    syllables_per_word = num_syllables / num_words
    return 206.835 - 1.015 * words_per_sentence - 84.6 * syllables_per_word

In [5]:
# Part-of-speech tags (as emitted by nltk.pos_tag) that are kept as features.
pos_tags = ['JJ', 'VBP', 'RB']

def add_pos_features(df, text_column, pos_tags):
    """Add one column per requested POS tag holding its scaled frequency.

    For every row, the tag counts of the row's text are min-max scaled
    against the other tag counts of the same row, and the scaled value of
    each tag listed in ``pos_tags`` is written to the column of that name.

    Args:
        df: DataFrame to extend. It is modified in place.
        text_column: Name of the column that holds the text.
        pos_tags: Iterable of tag names to keep as feature columns.

    Returns:
        The same ``df`` object. A cell stays NaN when the row's text does not
        contain the tag, including rows whose text has no tokens at all.
    """
    # Create every requested column up front so it exists even when the tag
    # never occurs anywhere in the corpus.
    for pos in pos_tags:
        if pos not in df.columns:
            df[pos] = np.nan

    for index, row in df.iterrows():
        pos_frequencies = get_pos_frequencies(row, text_column)

        # MinMaxScaler cannot be fitted on zero samples; leave such rows NaN.
        if not pos_frequencies:
            continue

        # One sample per tag, one feature: scaling is across this row's tags.
        counts = np.array(list(pos_frequencies.values())).reshape(-1, 1)
        scaled = MinMaxScaler().fit_transform(counts)

        for i, pos in enumerate(pos_frequencies):
            if pos in pos_tags:
                # Store the scalar, not the length-1 row of the scaled array.
                df.loc[index, pos] = scaled[i, 0]
    return df

# Attach the scaled POS-frequency columns for the tags listed in pos_tags.
texts_df = add_pos_features(df=texts_df, text_column='text', pos_tags=pos_tags)
texts_df

Unnamed: 0,id,text,JJ,VBP,RB
0,400,young peopl return ballroom present decidedli ...,0.326531,0.020408,0.020408
1,401,dinner time mr fayr somewhat silent eye rest d...,0.435897,0.128205,0.051282
2,402,roger predict snow depart quickli came two day...,0.416667,0.194444,0.194444
3,403,mr grime come next morn sir john harthov place...,0.410256,0.000000,0.025641
4,404,outsid palac great garden wall round fill full...,0.358491,0.018868,0.018868
...,...,...,...,...,...
4719,8027,name monarch mean king adult monarch butterfli...,0.666667,0.000000,0.153846
4720,8028,walk stick long thin slowmov bug look like sti...,0.673913,0.043478,0.173913
4721,8029,black widow shini black spider orang red mark ...,0.571429,0.040816,0.081633
4722,8030,solid shape actual touch three dimens mean len...,0.435897,0.076923,0.000000


In [6]:
# The per-document results below are deliberately NOT stored under the names
# of the functions that produce them (text_length, vocabulary_complexity,
# readability_scores). Rebinding those names to Series would make this cell
# fail on a second run with "'Series' object is not callable".

# Flesch score per document, min-max scaled across the corpus
scores = texts_df['text'].apply(flesch_kincaid)

scaler = MinMaxScaler()
scores_scaled = scaler.fit_transform(scores.values.reshape(-1, 1))
texts_df['flesch_kincaid_scaled'] = scores_scaled

# Token count per document, min-max scaled across the corpus.
# fit_transform re-fits the scaler, so the Flesch fit above does not leak in.
text_lengths = texts_df['text'].apply(text_length)
text_length_scaled = scaler.fit_transform(text_lengths.values.reshape(-1, 1))
texts_df['text_length_scaled'] = text_length_scaled

# Type-token ratio per document (already a ratio, so left unscaled)
texts_df['vocabulary_complexity'] = texts_df['text'].apply(vocabulary_complexity)

# Value returned by readability_scores() per document
texts_df['readability_scores'] = texts_df['text'].apply(readability_scores)

# Show data
texts_df

Unnamed: 0,id,text,JJ,VBP,RB,flesch_kincaid_scaled,text_length_scaled,vocabulary_complexity,readability_scores
0,400,young peopl return ballroom present decidedli ...,0.326531,0.020408,0.020408,0.565850,0.451220,0.831461,0.300000
1,401,dinner time mr fayr somewhat silent eye rest d...,0.435897,0.128205,0.051282,0.557634,0.439024,0.840909,0.523294
2,402,roger predict snow depart quickli came two day...,0.416667,0.194444,0.194444,0.623386,0.390244,0.857143,0.464502
3,403,mr grime come next morn sir john harthov place...,0.410256,0.000000,0.025641,0.748750,0.268293,0.851351,0.498214
4,404,outsid palac great garden wall round fill full...,0.358491,0.018868,0.018868,0.622547,0.475610,0.868132,0.639583
...,...,...,...,...,...,...,...,...,...
4719,8027,name monarch mean king adult monarch butterfli...,0.666667,0.000000,0.153846,0.531306,0.621951,0.592233,0.287708
4720,8028,walk stick long thin slowmov bug look like sti...,0.673913,0.043478,0.173913,0.457675,0.731707,0.660714,0.333356
4721,8029,black widow shini black spider orang red mark ...,0.571429,0.040816,0.081633,0.460257,0.646341,0.638095,0.305303
4722,8030,solid shape actual touch three dimens mean len...,0.435897,0.076923,0.000000,0.564425,0.243902,0.777778,0.217969


In [7]:
# Replace every NaN in the frame with 0. add_pos_features only writes a POS
# column for rows whose text contains that tag, so rows lacking a tag hold NaN
# in that column until this point.
texts_df = texts_df.fillna(0)

In [29]:
# Load the three pair tables; each row references two texts by id in the
# 'Text A' and 'Text B' columns.
train_df = pd.read_csv('train.csv')
validation_df = pd.read_csv('validation.csv')
test_df = pd.read_csv('test.csv')

# Per-text features to copy onto every pair. The order matters: it is also the
# order of the values inside each tuple of dictionary_tt.
features = ['flesch_kincaid_scaled', 'text_length_scaled', 'vocabulary_complexity', 'readability_scores', 'JJ', 'VBP', 'RB']

# Lookup table: text id -> tuple of that text's features, in `features` order
dictionary_tt = dict(zip(texts_df['id'], zip(*(texts_df[name] for name in features))))

# For every pair table, add "<feature>_A" and "<feature>_B" columns by looking
# up the ids stored in 'Text A' / 'Text B'. An id that is missing from
# dictionary_tt raises KeyError.
for frame in (train_df, validation_df, test_df):
    for position, name in enumerate(features):
        for side in ('A', 'B'):
            frame[f"{name}_{side}"] = frame[f"Text {side}"].apply(
                lambda text_id: dictionary_tt[text_id][position]
            )

column_order = ['flesch_kincaid_scaled_A', 'vocabulary_complexity_A', 'readability_scores_A', 'text_length_scaled_A', 'JJ_A', 'VBP_A', 'RB_A', 'flesch_kincaid_scaled_B', 'vocabulary_complexity_B', 'readability_scores_B', 'text_length_scaled_B', 'JJ_B', 'VBP_B', 'RB_B','Text A', 'Text B', 'Result']

# Keep only the listed columns, in that order, then discard the raw id columns
id_columns = ['Text A', 'Text B']
train_df = train_df.reindex(columns=column_order).drop(id_columns, axis=1)
validation_df = validation_df.reindex(columns=column_order).drop(id_columns, axis=1)
test_df = test_df.reindex(columns=column_order).drop(id_columns, axis=1)


# Show data
test_df

Unnamed: 0,flesch_kincaid_scaled_A,vocabulary_complexity_A,readability_scores_A,text_length_scaled_A,JJ_A,VBP_A,RB_A,flesch_kincaid_scaled_B,vocabulary_complexity_B,readability_scores_B,text_length_scaled_B,JJ_B,VBP_B,RB_B,Result
0,0.771456,0.855072,0.318519,0.207317,0.289474,0.000000,0.052632,0.571872,0.887640,0.394318,0.451220,0.229167,0.062500,0.062500,
1,0.771456,0.855072,0.318519,0.207317,0.289474,0.000000,0.052632,0.307715,0.625000,0.057955,0.536585,0.365385,0.019231,0.038462,
2,0.771456,0.855072,0.318519,0.207317,0.289474,0.000000,0.052632,0.532334,0.612500,0.505556,0.341463,0.369565,0.043478,0.043478,
3,0.771456,0.855072,0.318519,0.207317,0.289474,0.000000,0.052632,0.518425,0.958904,0.379630,0.256098,0.333333,0.023810,0.000000,
4,0.771456,0.855072,0.318519,0.207317,0.289474,0.000000,0.052632,0.557634,0.897727,0.446910,0.439024,0.500000,0.052632,0.052632,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4340,0.294493,0.778846,0.312963,0.634146,0.366667,0.083333,0.000000,0.289622,0.750000,0.373571,0.439024,0.232143,0.035714,0.000000,
4341,0.409325,0.682927,0.540000,0.365854,0.363636,0.068182,0.000000,0.467607,0.684211,0.298160,0.524390,0.458333,0.104167,0.104167,
4342,0.126189,0.674157,0.366667,0.451220,0.238095,0.015873,0.015873,0.762692,0.888889,0.350000,0.134146,0.258065,0.161290,0.096774,
4343,0.546580,0.857143,0.555556,0.304878,0.285714,0.095238,0.071429,0.567928,0.942857,0.276190,0.219512,0.333333,0.027778,0.055556,


In [18]:
# Number of training rows per 'Result' label, as a two-column frame
result_group_sizes = train_df.groupby('Result').size()
results_counts = result_group_sizes.reset_index(name='counts')
results_counts

Unnamed: 0,Result,counts
0,0,18908
1,1,20031


In [17]:
# Map every training label greater than 1 to 1 (labels <= 1 are unchanged).
# NOTE(review): only train_df is relabelled; if validation.csv can also hold
# labels above 1 they stay as-is and can never match a prediction — confirm.
# NOTE(review): the recorded execution counts (In [17] vs In [18]) suggest this
# cell was run before the class-count cell that precedes it in the notebook.
train_df.loc[train_df['Result'] > 1, 'Result'] = 1

In [42]:
# Import necessary libraries
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Fixed seed so the forest, and therefore the reported accuracy, is the same
# on every run of the notebook.
RANDOM_STATE = 42

# Hyperparameter grid
param_grid = {'n_estimators': [100, 200, 300],
              'max_depth': [10, 20, 30]}

# Input and output variables
X_train = train_df.drop(columns=['Result'])
y_train = train_df['Result']

# Input and output variables for the validation data
X_validate = validation_df.drop(columns=['Result'])
y_validate = validation_df['Result']

# The model
model = RandomForestClassifier(random_state=RANDOM_STATE)

# 5-fold cross-validated search over every combination in param_grid
grid_search = GridSearchCV(model, param_grid, cv=5)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_

# GridSearchCV refits the best configuration on the full training set by
# default (refit=True), so the fitted estimator is reused here instead of
# training a second forest with the same parameters.
best_model = grid_search.best_estimator_

# Evaluate the model on the validation data
accuracy = best_model.score(X_validate, y_validate)
print(f'Validation accuracy: {accuracy:.2f}')


Validation accuracy: 0.63


In [28]:
# Split the test frame into the feature matrix and its existing 'Result'
# column (y_test is not used for scoring in this cell).
y_test = test_df['Result']
X_test = test_df.drop('Result', axis=1)

# Predict with the selected model and store the labels back on the frame
y_pred = best_model.predict(X_test)
test_df['Result'] = y_pred

# Persist the features together with the predicted labels
test_df.to_csv('test_predictions.csv', index=False)

In [41]:
# Build the submission file: the original test.csv rows with their 'Result'
# column replaced by the predicted labels.

# Read back the predictions file and keep only the predicted labels
predictions_df = pd.read_csv('test_predictions.csv')
predictions_result_df = predictions_df['Result']
# Reload the untouched test table and remove its own 'Result' column
final_test_df = pd.read_csv('test.csv')
drop_result_df = final_test_df.drop(columns=['Result'])
# Join on the row index; how='right' keeps the index of the predictions.
# NOTE(review): this relies on the rows of test_predictions.csv being in the
# same order as the rows of test.csv — confirm nothing reorders test_df.
final_df = drop_result_df.merge(predictions_result_df, right_index=True, left_index=True, how='right')
final_df.to_csv('final_predictions.csv', index=False)