In [12]:
# Import the necessary libraries
import tensorflow_datasets as tfds
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from spellchecker import SpellChecker
import pandas as pd
import re

# Fetch the IMDB reviews dataset; `split` is given as a list, and the two
# results are consumed below as (train, test) in that order.
data = tfds.load('imdb_reviews', split=['train', 'test'])

# Turn each split into a pandas DataFrame.
train_df, test_df = (tfds.as_dataframe(split) for split in data)

# The 'text' column is decoded from bytes to str as UTF-8.
train_df['text'] = train_df['text'].map(lambda raw: raw.decode('utf-8'))
test_df['text'] = test_df['text'].map(lambda raw: raw.decode('utf-8'))

# Module-level spell checker instance used by `correct_spelling`.
spell = SpellChecker()

def correct_spelling(text):
    """Return ``text`` with misspelled words replaced by the spell checker's suggestion.

    The text is split on whitespace, every token that the module-level
    ``spell`` checker reports as unknown is replaced by
    ``spell.correction(token)``, and the tokens are re-joined with single
    spaces (so runs of whitespace are collapsed). A token for which no
    correction is available (``None``) is kept unchanged.

    Args:
        text: The string to correct.

    Returns:
        The corrected string.
    """
    words = text.split()
    # pyspellchecker's `unknown()` returns lower-cased words unless the checker
    # is case-sensitive, so the set must also be probed with the lower-cased
    # token; otherwise capitalised typos (e.g. at the start of a sentence)
    # would never be corrected.
    misspelled_words = spell.unknown(words)
    # Remember the result per distinct token so a repeated typo is only
    # looked up once.
    corrections = {}
    corrected_text = []
    for word in words:
        if word in misspelled_words or word.lower() in misspelled_words:
            if word not in corrections:
                correction = spell.correction(word)
                # Fall back to the original token when there is no suggestion.
                corrections[word] = word if correction is None else correction
            corrected_text.append(corrections[word])
        else:
            corrected_text.append(word)
    return " ".join(corrected_text)


# The IMDB train/test text is used uncorrected (the self-assignments that used
# to sit under a "Correct spelling" heading here were no-ops and have been
# removed). `correct_spelling` is applied only to the external samples further
# down, so training and inference text are preprocessed differently.
# NOTE(review): confirm this mismatch is intended.

# Bag-of-words features: fit the vocabulary on the training text only, then
# reuse it for the test split. English stop words are dropped.
vectorizer = CountVectorizer(stop_words='english')
X_train = vectorizer.fit_transform(train_df['text'])
X_test = vectorizer.transform(test_df['text'])

y_train = train_df['label']
y_test = test_df['label']

# Create the classifier and train it on the training split.
clf = MultinomialNB()
clf.fit(X_train, y_train)

# Load the extra evaluation samples from 'test_data.csv'. Every line of the
# file is taken as one raw text sample (the file is read line by line, not
# parsed as CSV), with the trailing newline removed.
with open('test_data.csv', 'r') as f:
    test_data = [line.strip('\n') for line in f]

test_samples_df = pd.DataFrame(test_data, columns=['text'])

# Spell-correct the samples, then vectorise them with the vocabulary that was
# fitted on the training text.
test_samples_df['text'] = test_samples_df['text'].map(correct_spelling)
X_test_samples = vectorizer.transform(test_samples_df['text'])

# Predict a label for every sample with the trained classifier.
predictions = clf.predict(X_test_samples)


In [13]:
# Show the label predicted by `clf` for each of the external samples.
print(predictions)

[0 1 1 0 1 0 1 1 0 1 0 1 1 0 1 1 1 0 0 0 1 1 0 1 1 1 0 0 0 1 1 1 0 0 1 1 0
 0 1 1 0 0 0 1 1 0 1 0 1 1]


In [14]:
# NOTE(review): `predictions` were made for the samples read from
# 'test_data.csv', but they are scored here against the first 50 IMDB training
# labels (`y_train[:50]`). Unless that file holds exactly those 50 training
# reviews in the same order, labels and predictions are unrelated and the
# report below is not meaningful — confirm, or load the samples' real labels.
print(classification_report(y_train[:50], predictions))

              precision    recall  f1-score   support

           0       0.64      0.47      0.54        30
           1       0.43      0.60      0.50        20

    accuracy                           0.52        50
   macro avg       0.53      0.53      0.52        50
weighted avg       0.55      0.52      0.52        50



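# Classifier: MultinomialNB (this note originally said "Logistic", but the code above fits MultinomialNB — verify which run these numbers come from)
# Spellchecker: applied to the 50 test samples
# Stopwords: English (via CountVectorizer)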
#              precision    recall  f1-score   support
#
#           0       0.64      0.47      0.54        30
#           1       0.43      0.60      0.50        20
#
#    accuracy                           0.52        50
#   macro avg       0.53      0.53      0.52        50
# weighted avg       0.55      0.52      0.52        50