## SPELLING CORRECTION 

### Step 1: Preprocess data in a Swahili dataset:

In [None]:
import pandas as pd
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# Read the CSV file into a pandas dataframe
df = pd.read_csv('swahili_spelling.csv')

# Define a function to preprocess the text
def preprocess_text(text):
    # Remove any non-letter character using regex
    text = re.sub('[^a-zA-Z]', ' ', text)
    # Convert text to lowercase
    text = text.lower()
    # Tokenize the text
    tokens = nltk.word_tokenize(text)
    # Join the tokens back into a string
    text = ' '.join(tokens)
    return text

# Apply the function to the 'text' column of the dataframe
df['text'] = df['text'].apply(preprocess_text)

# Save the cleaned dataframe to a new CSV file
df.to_csv('cleaned_swahili_spelling.csv', index=False)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Step 2: Build a language model that can identify the most likely spelling corrections for each word in the tokenized Swahili text:

In [None]:
import pandas as pd
import re
import nltk
nltk.download('punkt')
from nltk.util import ngrams
from collections import Counter

# Read the CSV file into a pandas dataframe
df = pd.read_csv('swahili_spelling.csv')

# Define a function to preprocess the text
def preprocess_text(text):
    # Remove any non-letter character using regex
    text = re.sub('[^a-zA-Z]', ' ', text)
    # Convert text to lowercase
    text = text.lower()
    # Tokenize the text
    tokens = nltk.word_tokenize(text)
    return tokens

# Apply the function to the 'text' column of the dataframe
df['tokens'] = df['text'].apply(preprocess_text)

# Define a function to train a n-gram model on the tokenized text
def train_ngram_model(tokens, n):
    # Create a list of n-grams from the tokenized text
    ngrams_list = list(ngrams(tokens, n))
    # Count the frequency of each n-gram
    ngrams_count = Counter(ngrams_list)
    # Calculate the probability of each n-gram
    ngrams_prob = {}
    for ngram, count in ngrams_count.items():
        context = ' '.join(ngram[:-1])
        if context not in ngrams_prob:
            ngrams_prob[context] = {}
        ngrams_prob[context][ngram[-1]] = count / sum(ngrams_count[ngram[:n-1] + (w,)] for w in tokens)
    return ngrams_prob

# Train a trigram model on the entire dataset
trigram_model = {}
for tokens in df['tokens']:
    trigram_model.update(train_ngram_model(tokens, 3))

# Define a function to correct the spelling of a token using the trigram model
def correct_spelling(token, model):
    context = ' '.join(token[:-1])
    if context in model:
        candidates = model[context]
        correction = max(candidates, key=candidates.get)
        return correction
    else:
        return token

# Define a function to correct the spelling of a list of tokens using the trigram model
def correct_tokens(tokens, model):
    corrected_tokens = [correct_spelling(token, model) for token in tokens]
    return corrected_tokens

# Apply the function to the 'tokens' column of the dataframe
df['corrected_tokens'] = df['tokens'].apply(lambda x: correct_tokens(x, trigram_model))

# Save the cleaned dataframe to a new CSV file
df.to_csv('corrected_swahili_spelling.csv', index=False)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Step 3: Training a machine learning model for spelling correction: 

In [None]:
import pandas as pd
import re
import nltk
nltk.download('punkt')
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline

# Read the CSV file into a pandas dataframe
df = pd.read_csv('corrected_swahili_spelling.csv')

# Define a function to preprocess the text
def preprocess_text(text):
    # Remove any non-letter character using regex
    text = re.sub('[^a-zA-Z]', ' ', text)
    # Convert text to lowercase
    text = text.lower()
    # Tokenize the text
    tokens = nltk.word_tokenize(text)
    return ' '.join(tokens)

# Apply the function to the 'text' column of the dataframe
df['preprocessed_text'] = df['text'].apply(preprocess_text)

# Define the input and output variables for the machine learning model
X = df['preprocessed_text']
y = df['corrected_tokens']

# Define a pipeline that includes a count vectorizer and a decision tree classifier
pipeline = Pipeline([
    ('vectorizer', CountVectorizer(ngram_range=(1, 3))),
    ('classifier', DecisionTreeClassifier())
])

# Train the machine learning model on the preprocessed text and the corrected tokens
pipeline.fit(X, y)

# Save the trained model to a pickle file
import pickle
with open('swahili_spelling_correction_model.pkl', 'wb') as f:
    pickle.dump(pipeline, f)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


KeyboardInterrupt: 

### Step 4: Test the trained model on a separate dataset of Swahili text:

In [None]:
import pandas as pd
import re
import nltk
nltk.download('punkt')
import pickle

# Load the trained model from the pickle file
with open('swahili_spelling_correction_model.pkl', 'rb') as f:
    model = pickle.load(f)

# Read the test dataset from a CSV file into a pandas dataframe
df_test = pd.read_csv('corrected_swahili_spelling.csv')

# Define a function to preprocess the text for the test dataset
def preprocess_text(text):
    # Remove any non-letter character using regex
    text = re.sub('[^a-zA-Z]', ' ', text)
    # Convert text to lowercase
    text = text.lower()
    # Tokenize the text
    tokens = nltk.word_tokenize(text)
    return ' '.join(tokens)

# Apply the function to the 'text' column of the test dataframe
df_test['preprocessed_text'] = df_test['text'].apply(preprocess_text)

# Use the trained model to predict corrected tokens for the test dataset
y_pred = model.predict(df_test['preprocessed_text'])

# Compute the accuracy of the model
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(df_test['corrected_tokens'], y_pred)
print('Accuracy:', accuracy)

# Compute the precision, recall, and F1 score of the model
from sklearn.metrics import classification_report
print(classification_report(df_test['corrected_tokens'], y_pred))


                                                                                                                                                                                                                                                                           ['si', 'yenye', 'kutisha', 'sana', 'na', 'isiyofahamika', 'sana']       1.00      1.00      1.00         1
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 ['si', 'ziara', 'ya', 'kila', 'juma', 'bali', 'kw

# SENTIMENT ANALYZER

### Step 1: Preprocess data in a Swahili dataset:

In [None]:
import pandas as pd
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# Read the CSV file into a pandas dataframe
df = pd.read_csv('swahili_sentiment.csv')

# Define a function to preprocess the text
def preprocess_text(text):
    # Remove any non-letter character using regex
    text = re.sub('[^a-zA-Z]', ' ', text)
    # Convert text to lowercase
    text = text.lower()
    # Tokenize the text
    tokens = nltk.word_tokenize(text)
    # Join the tokens back into a string
    text = ' '.join(tokens)
    return text

# Apply the function to the 'text' column of the dataframe
df['text'] = df['text'].apply(preprocess_text)

# Save the cleaned dataframe to a new CSV file
df.to_csv('cleaned_swahili_sentiment.csv', index=False)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Step 2: Extract features from processed data

In [10]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec

# Load the CSV file into a pandas DataFrame
df = pd.read_csv('cleaned_swahili_sentiment.csv')

# Replace NaNs with an empty string
df = df.replace(np.nan, '', regex=True)

# Extract bag-of-words features
vectorizer = CountVectorizer()
bow_features = vectorizer.fit_transform(df['text'])

# Extract TF-IDF features
tfidf_vectorizer = TfidfVectorizer()
tfidf_features = tfidf_vectorizer.fit_transform(df['text'])

# Train a Word2Vec model on the preprocessed text
sentences = [text.split() for text in df['text']]
w2v_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

# Extract Word2Vec features for each sentence
w2v_features = []
for sentence in sentences:
    sentence_vec = []
    for word in sentence:
        if word in w2v_model.wv.key_to_index:
            sentence_vec.append(w2v_model.wv[word])
    w2v_features.append(np.mean(sentence_vec, axis=0))
w2v_features = np.array(w2v_features)

# Save the features to separate CSV files
np.savetxt('bow_features.csv', bow_features.toarray(), delimiter=',')
np.savetxt('tfidf_features.csv', tfidf_features.toarray(), delimiter=',')


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  w2v_features = np.array(w2v_features)


### Step 3: Train a sentiment analysis model using the bag-of-words features and the Naive Bayes algorithm:

In [11]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Load the CSV file into a pandas DataFrame
df = pd.read_csv('cleaned_swahili_sentiment.csv')

# Replace NaNs with an empty string
df = df.replace(np.nan, '', regex=True)

# Extract bag-of-words features
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['text'])

# Split the data into training and test sets
y = df['labels']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Naive Bayes classifier on the training set
clf = MultinomialNB()
clf.fit(X_train, y_train)

# Evaluate the classifier on the test set
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")


Accuracy: 0.7847133757961784


### Step 4: Evaluate the performance of the model using the accuracy score and 10-fold cross-validation:

In [12]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Load the CSV file into a pandas DataFrame
df = pd.read_csv('cleaned_swahili_sentiment.csv')

# Replace NaNs with an empty string
df = df.replace(np.nan, '', regex=True)

# Extract bag-of-words features
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['text'])

# Split the data into training and test sets
y = df['labels']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Naive Bayes classifier on the training set
clf = MultinomialNB()
clf.fit(X_train, y_train)

# Evaluate the classifier using cross-validation
scores = cross_val_score(clf, X, y, cv=10)
print(f"Cross-validation scores: {scores}")
print(f"Mean cross-validation score: {np.mean(scores)}")

# Evaluate the classifier on the test set
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")


Cross-validation scores: [0.8346056  0.85496183 0.72010178 0.76590331 0.70992366 0.78826531
 0.78316327 0.76785714 0.71428571 0.70153061]
Mean cross-validation score: 0.7640598224022435
Accuracy: 0.7847133757961784


## HANDWRITING RECOGNITION

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=9c6da334-c0ce-4d5d-91f7-232702f78e3c' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>