In [1]:
#preprocess task with nlp

import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

#NLTK RESOURCES
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")

# DATA PATH
content_df = pd.read_csv(r'C:\Users\nh013\Desktop\Learning Equality - Curriculum Recommendations\content.csv')
correlations_df = pd.read_csv(r'C:\Users\nh013\Desktop\Learning Equality - Curriculum Recommendations\correlations.csv')
topics_df = pd.read_csv(r'C:\Users\nh013\Desktop\Learning Equality - Curriculum Recommendations\topics.csv')

# IDENTIFY MISSING VALUES
missing_values_content = content_df.isnull().sum()
missing_values_correlations = correlations_df.isnull().sum()
missing_values_topics = topics_df.isnull().sum()

# DROP MISSING VALUES
content_df.dropna(inplace=True)
correlations_df.dropna(inplace=True)
topics_df.dropna(inplace=True)

# REMOVE DUPLICATE ROWS
content_df.drop_duplicates(inplace=True)
correlations_df.drop_duplicates(inplace=True)
topics_df.drop_duplicates(inplace=True)

# NLP PREPROCESSING FUNCTION
def preprocess_text(text):
    if isinstance(text, str):
        tokens = word_tokenize(text)
        tokens = [token for token in tokens if token.isalnum()]
        tokens = [token.lower() for token in tokens]
        stop_words = set(stopwords.words("english"))
        tokens = [token for token in tokens if token not in stop_words]
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(token) for token in tokens]
        return " ".join(tokens)
    else:
        return ""

# NLP PREPROCESSING IN CONTENT_DF
content_df['description'] = content_df['description'].apply(preprocess_text)
content_df['text'] = content_df['text'].apply(preprocess_text)

#NLP PREPROCESSING IN TOPICS_DF
topics_df['title'] = topics_df['title'].apply(preprocess_text)
topics_df['description'] = topics_df['description'].apply(preprocess_text)
topics_df['channel'] = topics_df['channel'].apply(preprocess_text)
topics_df['category'] = topics_df['category'].apply(preprocess_text)


print("Preprocessed content_df:")
print(content_df.head())

print("Preprocessed topics_df:")
print(topics_df.head())


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nh013\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nh013\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nh013\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Preprocessed content_df:
                id                                         title  \
3   c_0000c03adc8d                           Nado de aproximação   
4   c_00016694ea2a              geometry-m3-topic-a-overview.pdf   
12  c_0005765779c8                         Ендомембранна система   
14  c_00068709797c  Kuhesabu vizio mraba kutafuta kanuni ya eneo   
17  c_0006f51dc8e7                                    Lección 13   

                                          description      kind  \
3   neste vídeo você vai aprender nado de aproxima...  document   
4                                                      document   
12  преглед на мембранните структури които образув...     video   
14  sal anatumia kizio mraba kuona kwa nini kuzidi...     video   
17  objetivo multiplicar factores numéricos mixtos...  document   

                                                 text language  \
3   nado de aproximação saber nadar na ondas sem p...       pt   
4   estándares comunes del esta

In [3]:
# perform tfidf vectorization

import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

# NLTK RESOURCES
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")

#DATA PATH
content_df = pd.read_csv(r'C:\Users\nh013\Desktop\Learning Equality - Curriculum Recommendations\content.csv')
correlations_df = pd.read_csv(r'C:\Users\nh013\Desktop\Learning Equality - Curriculum Recommendations\correlations.csv')
topics_df = pd.read_csv(r'C:\Users\nh013\Desktop\Learning Equality - Curriculum Recommendations\topics.csv')

# IDENTIFY MISSING VALUES
missing_values_content = content_df.isnull().sum()
missing_values_correlations = correlations_df.isnull().sum()
missing_values_topics = topics_df.isnull().sum()

# DROP MISSING VALUES
content_df.dropna(inplace=True)
correlations_df.dropna(inplace=True)
topics_df.dropna(inplace=True)

# REMOVE DUPLICATE ROWS
content_df.drop_duplicates(inplace=True)
correlations_df.drop_duplicates(inplace=True)
topics_df.drop_duplicates(inplace=True)

# NLP PREPROCESSING FUNCTION
def preprocess_text(text):
    if isinstance(text, str):
        tokens = word_tokenize(text)
        tokens = [token for token in tokens if token.isalnum()]
        tokens = [token.lower() for token in tokens]
        stop_words = set(stopwords.words("english"))
        tokens = [token for token in tokens if token not in stop_words]
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(token) for token in tokens]
        return " ".join(tokens)
    else:
        return ""

# NLP PREPROCESSING IN CONTENT_DF
content_df['description'] = content_df['description'].apply(preprocess_text)
content_df['text'] = content_df['text'].apply(preprocess_text)

# NLP PREPROCESSING IN TOPICS_DF
topics_df['title'] = topics_df['title'].apply(preprocess_text)
topics_df['description'] = topics_df['description'].apply(preprocess_text)
topics_df['channel'] = topics_df['channel'].apply(preprocess_text)
topics_df['category'] = topics_df['category'].apply(preprocess_text)

# COMBINE TEXT COLUMN FOR  TF-IDF vectorization
content_df['combined_text'] = content_df['description'] + " " + content_df['text']
topics_df['combined_text'] = topics_df['title'] + " " + topics_df['description'] + " " + topics_df['channel'] + " " + topics_df['category']

# TF-IDF VECTORIZATION
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
tfidf_combined_content = tfidf_vectorizer.fit_transform(content_df['combined_text'])
tfidf_combined_topics = tfidf_vectorizer.transform(topics_df['combined_text'])

# CONVERT TF-IDF MATRICS TO DATAFRAME 
tfidf_combined_df_content = pd.DataFrame(tfidf_combined_content.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
tfidf_combined_df_topics = pd.DataFrame(tfidf_combined_topics.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

# CONCATINATE TF-IDF MATRICS TO DATAFRAME
content_df = pd.concat([content_df, tfidf_combined_df_content], axis=1)
topics_df = pd.concat([topics_df, tfidf_combined_df_topics], axis=1)


print("Preprocessed content_df:")
print(content_df.head())

print("Preprocessed topics_df:")
print(topics_df.head())


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nh013\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nh013\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nh013\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Preprocessed content_df:
                id                                         title  \
3   c_0000c03adc8d                           Nado de aproximação   
4   c_00016694ea2a              geometry-m3-topic-a-overview.pdf   
12  c_0005765779c8                         Ендомембранна система   
14  c_00068709797c  Kuhesabu vizio mraba kutafuta kanuni ya eneo   
17  c_0006f51dc8e7                                    Lección 13   

                                          description      kind  \
3   neste vídeo você vai aprender nado de aproxima...  document   
4                                                      document   
12  преглед на мембранните структури които образув...     video   
14  sal anatumia kizio mraba kuona kwa nini kuzidi...     video   
17  objetivo multiplicar factores numéricos mixtos...  document   

                                                 text language  \
3   nado de aproximação saber nadar na ondas sem p...       pt   
4   estándares comunes del esta

In [6]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# NLTK RESOURCES
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")

# DATA PATH 
content_df = pd.read_csv(r'C:\Users\nh013\Desktop\Learning Equality - Curriculum Recommendations\content.csv')
correlations_df = pd.read_csv(r'C:\Users\nh013\Desktop\Learning Equality - Curriculum Recommendations\correlations.csv')
topics_df = pd.read_csv(r'C:\Users\nh013\Desktop\Learning Equality - Curriculum Recommendations\topics.csv')

# IDENTIFY MISSING VALUES
missing_values_content = content_df.isnull().sum()
missing_values_correlations = correlations_df.isnull().sum()
missing_values_topics = topics_df.isnull().sum()

# DROP MISSING VALUES
content_df.dropna(inplace=True)
correlations_df.dropna(inplace=True)
topics_df.dropna(inplace=True)

# REMOVE DUPLICATE ROWS
content_df.drop_duplicates(inplace=True)
correlations_df.drop_duplicates(inplace=True)
topics_df.drop_duplicates(inplace=True)

# NLP PREPROCESSING FUNCTION
def preprocess_text(text):
    if isinstance(text, str):
        tokens = word_tokenize(text)
        tokens = [token for token in tokens if token.isalnum()]
        tokens = [token.lower() for token in tokens]
        stop_words = set(stopwords.words("english"))
        tokens = [token for token in tokens if token not in stop_words]
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(token) for token in tokens]
        return " ".join(tokens)
    else:
        return ""

# NLP PREPROCESSING IN CONTENT_DF
content_df['description'] = content_df['description'].apply(preprocess_text)
content_df['text'] = content_df['text'].apply(preprocess_text)

# NLP PREPROCESSING IN TOPICS_DF
topics_df['title'] = topics_df['title'].apply(preprocess_text)
topics_df['description'] = topics_df['description'].apply(preprocess_text)
topics_df['channel'] = topics_df['channel'].apply(preprocess_text)
topics_df['category'] = topics_df['category'].apply(preprocess_text)

# COMBINE TEXT COLUMN FOR  TF-IDF vectorization
content_df['combined_text'] = content_df['description'] + " " + content_df['text']
topics_df['combined_text'] = topics_df['title'] + " " + topics_df['description'] + " " + topics_df['channel'] + " " + topics_df['category']

# TF-IDF VECTORIZATION
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
tfidf_combined_content = tfidf_vectorizer.fit_transform(content_df['combined_text'])
tfidf_combined_topics = tfidf_vectorizer.transform(topics_df['combined_text'])

# CONVERT TF-IDF MATRICES TO DATAFRAME
tfidf_combined_df_content = pd.DataFrame(tfidf_combined_content.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
tfidf_combined_df_topics = pd.DataFrame(tfidf_combined_topics.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

# CONCATINATE TF-IDF MATRICES TO DATAFRAME 
content_df = pd.concat([content_df, tfidf_combined_df_content], axis=1)
topics_df = pd.concat([topics_df, tfidf_combined_df_topics], axis=1)


# SELECT COLUMN FOR CONTENT DATAFRAME 
content_columns = ['id', 'title', 'description', 'kind', 'text', 'language', 'copyright_holder', 'license']
content_data = content_df[content_columns]

# SELECT COLUMN FOR TOPICS DATAFRAME
topics_columns = ['id', 'title', 'description', 'channel', 'category', 'level', 'language', 'parent', 'has_content']
topics_data = topics_df[topics_columns]

# SPLIT CONTENT DATA FOR TRAINING AND TESTING SET
content_train, content_test = train_test_split(content_data, test_size=0.2, random_state=42)

# SPLIT TOPICS DATA FOR TRAINING AND TESTING SET
topics_train, topics_test = train_test_split(topics_data, test_size=0.2, random_state=42)


print("Content Train Shape:", content_train.shape)
print("Content Test Shape:", content_test.shape)
print("Topics Train Shape:", topics_train.shape)
print("Topics Test Shape:", topics_test.shape)



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nh013\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nh013\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nh013\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Content Train Shape: (57624, 14)
Content Test Shape: (14407, 14)
Topics Train Shape: (43189, 13)
Topics Test Shape: (10798, 13)


In [4]:
#perfrom rnn model 

import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

#NLTK RESOURCES
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")

# DATA PATH
content_df = pd.read_csv(r'C:\Users\nh013\Desktop\Learning Equality - Curriculum Recommendations\content.csv')
correlations_df = pd.read_csv(r'C:\Users\nh013\Desktop\Learning Equality - Curriculum Recommendations\correlations.csv')
topics_df = pd.read_csv(r'C:\Users\nh013\Desktop\Learning Equality - Curriculum Recommendations\topics.csv')

# IDENTIFY MISSING VALUES
missing_values_content = content_df.isnull().sum()
missing_values_correlations = correlations_df.isnull().sum()
missing_values_topics = topics_df.isnull().sum()

# DROP MISSING VALUES
content_df.dropna(inplace=True)
correlations_df.dropna(inplace=True)
topics_df.dropna(inplace=True)

# REMOVE DUPLICATE ROWS
content_df.drop_duplicates(inplace=True)
correlations_df.drop_duplicates(inplace=True)
topics_df.drop_duplicates(inplace=True)

# NLP PREPROCESSING FUNCTION
def preprocess_text(text):
    if isinstance(text, str):
        tokens = word_tokenize(text)
        tokens = [token for token in tokens if token.isalnum()]
        tokens = [token.lower() for token in tokens]
        stop_words = set(stopwords.words("english"))
        tokens = [token for token in tokens if token not in stop_words]
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(token) for token in tokens]
        return " ".join(tokens)
    else:
        return ""

# NLP PREPROCESSING FOR CONTENT_DF
content_df['description'] = content_df['description'].apply(preprocess_text)
content_df['text'] = content_df['text'].apply(preprocess_text)

# NLP PREPROCESSING FOR TOPICS_DF
topics_df['title'] = topics_df['title'].apply(preprocess_text)
topics_df['description'] = topics_df['description'].apply(preprocess_text)
topics_df['channel'] = topics_df['channel'].apply(preprocess_text)
topics_df['category'] = topics_df['category'].apply(preprocess_text)

# COMBINE TEXT COLUMN FOR TF-IDF VECTORIZATION
content_df['combined_text'] = content_df['description'] + " " + content_df['text']
topics_df['combined_text'] = topics_df['title'] + " " + topics_df['description'] + " " + topics_df['channel'] + " " + topics_df['category']

# REBUILD 'combined_text' COLUMN AFTER DROPPING MISSING  VALUES
content_df['combined_text'] = content_df['combined_text'].apply(preprocess_text)
topics_df['combined_text'] = topics_df['combined_text'].apply(preprocess_text)

# COMBINE CONTENT AND TOPICS DATA 
combined_data = content_df['combined_text'].tolist() + topics_df['combined_text'].tolist()
combined_labels = np.concatenate((np.zeros(len(content_df)), np.ones(len(topics_df))))

# SPLIT  COMBINED DATA INTO TRAINING AND TESTING SET 
train_data, test_data, train_labels, test_labels = train_test_split(combined_data, combined_labels, test_size=0.2, random_state=42)

# TOKENIZE AND PAD SEQUENCE
tokenizer = Tokenizer(num_words=1000, oov_token="<OOV>")
tokenizer.fit_on_texts(train_data)
train_sequences = tokenizer.texts_to_sequences(train_data)
test_sequences = tokenizer.texts_to_sequences(test_data)
train_padded = pad_sequences(train_sequences, maxlen=100, padding='post', truncating='post')
test_padded = pad_sequences(test_sequences, maxlen=100, padding='post', truncating='post')

# BUILD RNN MODEL
model = Sequential([
    Embedding(input_dim=1000, output_dim=32, input_length=100),
    LSTM(64),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# TRAIN THE MODEL 
model.fit(train_padded, train_labels, epochs=5, batch_size=32)

# EVALUATE THE MODEL
test_loss, test_accuracy = model.evaluate(test_padded, test_labels)
print("Test Accuracy:", test_accuracy)






[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nh013\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nh013\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nh013\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test Accuracy: 0.9708129167556763
