# Description

This notebook explores the data generated by translating the titles and descriptions into English.

In [1]:
import data
import data_vectorizer
import nltk

# Number of texts without spaces

In [None]:
# Run data_vectorizer.transform_replace_line_breaks over every translated text
# column of both tables, overwriting the columns in place.
# NOTE(review): presumably this strips or replaces line breaks; confirm in data_vectorizer.
for frame in (data.topics, data.contents):
    for column in ("title_translate", "description_translate"):
        frame[column] = frame[column].map(data_vectorizer.transform_replace_line_breaks)

def has_space(x):
    """Return True when ``x`` is a string that contains no space character.

    NOTE: despite its name, this predicate flags text *without* spaces. The
    name and behaviour are kept because this notebook uses it to select the
    space-free rows.

    Args:
        x: A cell value. Anything that is not a ``str`` (NaN, None, ...) is
            treated as missing.

    Returns:
        bool: True for a string without any " ", False for a string that has
        one and for every non-string value.
    """
    # Missing values are not always plain floats (None, numpy scalars, pd.NA),
    # so test for "is text" instead of "is float" to avoid an AttributeError.
    if not isinstance(x, str):
        return False
    return " " not in x

def count_space(x):
    """Count the space characters in ``x``.

    Args:
        x: A cell value. Anything that is not a ``str`` (NaN, None, ...) is
            treated as missing.

    Returns:
        int: Number of " " characters in the string, or 0 for a non-string
        value.
    """
    # Return the integer 0 (not the bool False) for missing values so the
    # result type is uniform; 0 == False, so existing comparisons still hold.
    if not isinstance(x, str):
        return 0
    return x.count(" ")

# Show, for topics and then contents, the rows for which has_space is True on
# both the translated title and the translated description.
for frame in (data.topics, data.contents):
    title_flag = frame["title_translate"].map(has_space)
    description_flag = frame["description_translate"].map(has_space)
    display(frame.loc[title_flag & description_flag])

It seems that the texts without any spaces are file names or source_id strings.

In [None]:
# Number of spaces a title and a description must both contain to be shown.
count = 2
for frame in (data.topics, data.contents):
    title_matches = frame["title_translate"].map(count_space) == count
    description_matches = frame["description_translate"].map(count_space) == count
    display(frame.loc[title_matches & description_matches])

Inspecting the rows with count == 1 and count == 2 suggests that "-" and "\_" should be replaced with whitespace so that file names are split into separate words.

In [None]:
# Run data_vectorizer.transform_replace_symbols over every translated text
# column of both tables, overwriting the columns in place.
# NOTE(review): presumably this swaps "-" and "_" for spaces; confirm in data_vectorizer.
for frame in (data.topics, data.contents):
    for column in ("title_translate", "description_translate"):
        frame[column] = frame[column].map(data_vectorizer.transform_replace_symbols)

# Number of spaces a title and a description must both contain to be shown.
count = 1
frames = (data.topics, data.contents)

# First the rows flagged by has_space on both text columns (topics, then contents) ...
for frame in frames:
    title_flag = frame["title_translate"].map(has_space)
    description_flag = frame["description_translate"].map(has_space)
    display(frame.loc[title_flag & description_flag])

# ... then the rows whose text columns both contain exactly `count` spaces.
for frame in frames:
    title_matches = frame["title_translate"].map(count_space) == count
    description_matches = frame["description_translate"].map(count_space) == count
    display(frame.loc[title_matches & description_matches])

# Attempt at applying a lemmatizer for vectorization

In [None]:
# Lemmatize the translated description of one sample topic and print the result.
sample_description = data.topics.loc["t_00068291e9a4", "description_translate"]
sample_lemmas = data_vectorizer.lemmatize_sentence(sample_description)
print(sample_lemmas)

In [None]:
# Render the topics table as it stands after the transformations run so far.
topics_table = data.topics
display(topics_table)

# Generate word frequency

In [None]:
import time
import pandas as pd
# Running tally: word -> number of occurrences seen so far.
words_freq = {}

def add_list_to_words_freq(mlist):
    """Add one occurrence to the global ``words_freq`` tally for each item of ``mlist``.

    Args:
        mlist: Iterable of hashable words; repeated words are counted once
            per appearance.
    """
    for token in mlist:
        # Unseen words start from zero, so the first sighting stores 1.
        words_freq[token] = words_freq.get(token, 0) + 1

def obtain_lemmas(x):
    """Lemmatize the translated title and description of one row and tally the lemmas.

    Args:
        x: Row-like object indexable by "title_translate" and
            "description_translate".

    A value whose type is exactly ``float`` is skipped (presumably a NaN
    marking a missing translation; confirm against the data). Every other
    value is passed to ``data_vectorizer.lemmatize_sentence`` and the result
    is fed to ``add_list_to_words_freq``. Nothing is returned.
    """
    # Read both fields up front, then process the title before the description.
    texts = (x["title_translate"], x["description_translate"])
    for text in texts:
        if type(text) == float:
            continue
        add_list_to_words_freq(data_vectorizer.lemmatize_sentence(text))

# Tally the lemmas of every topic row, then every content row, printing the
# wall-clock time each pass took.
for frame in (data.topics, data.contents):
    started = time.time()
    for _, row in frame.iterrows():
        obtain_lemmas(row)
    ctime = time.time() - started
    print("Time elapsed: ", ctime)

# Turn the tally into a Series ordered from most to least frequent.
words_freq = pd.Series(words_freq).sort_values(ascending=False)

# Lay the ranking out as a two-column table on a 0..n-1 index and save it.
words_frame = pd.DataFrame(
    {"word": words_freq.index, "frequency": words_freq.values},
    index=range(len(words_freq)),
)
words_frame.to_csv("data/word_freqs.csv")

In [None]:
# Render the word-frequency table exposed by data_vectorizer.
loaded_word_freqs = data_vectorizer.word_freqs
display(loaded_word_freqs)

In [None]:
import pandas as pd
# Reverse lookup: indexed by word, with column "pos" holding that word's
# index label in data_vectorizer.word_freqs.
freq_table = data_vectorizer.word_freqs
inverse = pd.DataFrame(
    data=freq_table.index,
    index=freq_table["word"],
    columns=["pos"],
)

In [None]:
# Look up the positions of a few probe words; words absent from the index are
# dropped by the intersection instead of raising a KeyError.
probe_words = ["use","be", "rwrfeg"]
known_words = inverse.index.intersection(probe_words)
display(inverse["pos"].loc[known_words])

In [None]:
# Vectorize a file-name-like string to see how such text is handled.
sample_text = "geometry-m3-topic-a-overview.pdf"
data_vectorizer.vectorize(sample_text)

# Obtain a list of learnable topics and contents

In [None]:
def is_learnable(x):
    """Tell whether a row's translated text vectorizes to at least one entry.

    Args:
        x: Row-like object indexable by "title_translate" and
            "description_translate".

    Returns:
        bool: True when the title vector and the description vector returned
        by ``data_vectorizer.vectorize`` have a combined length above zero.
    """
    fields = ("title_translate", "description_translate")
    vectors = [data_vectorizer.vectorize(x[field]) for field in fields]
    return sum(map(len, vectors)) > 0

# Flag every topic and every content row with is_learnable ...
learnable_topics, learnable_contents = [
    frame.apply(is_learnable, axis = 1) for frame in (data.topics, data.contents)
]

# ... and write each set of flags to its own CSV file.
exports = (
    (learnable_topics, "data/learnable_topics.csv"),
    (learnable_contents, "data/learnable_contents.csv"),
)
for flags, path in exports:
    flags.to_csv(path)

# Generate the vectorizations

In [8]:
# Work on copies so the tables held by the data module stay untouched.
topics = data.topics.copy()
contents = data.contents.copy()

# Add a vectorized companion column for each translated text column.
column_pairs = (
    ("title_translate", "title_vectorize"),
    ("description_translate", "description_vectorize"),
)
for frame in (topics, contents):
    for source_column, target_column in column_pairs:
        frame[target_column] = frame[source_column].apply(data_vectorizer.vectorize)

# Save the augmented tables.
topics.to_csv("data/topics_translate_vectorize.csv")
contents.to_csv("data/contents_translate_vectorize.csv")

In [7]:
# Preview the vectorized form of the translated topic titles.
translated_titles = topics["title_translate"]
translated_titles.apply(data_vectorizer.vectorize)

id
t_00004da3a1b2          []
t_000095e03056        [14]
t_00068291e9a4    [8, 896]
t_00069b63a70a      [2051]
t_0006d41a73a8          []
Name: title_translate, dtype: object

In [4]:
# Render the topics table held by the data module.
topics_table = data.topics
display(topics_table)

Unnamed: 0_level_0,title,description,channel,category,level,language,parent,has_content,title_translate,description_translate
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
t_00004da3a1b2,Откриването на резисторите,"Изследване на материали, които предизвикват на...",000cf7,source,4,bg,t_16e29365b50d,True,,
t_000095e03056,Unit 3.3 Enlargements and Similarities,,b3f329,aligned,2,en,t_aa32fb6252dc,False,Unit 3.3 Enlargements and Similarities,
t_00068291e9a4,Entradas e saídas de uma função,Entenda um pouco mais sobre funções.,8e286a,source,4,pt,t_d14b6c2a2b70,True,Inputs and outputs of a function,Understand a little more about functions.
t_00069b63a70a,Transcripts,,6e3ba4,source,3,en,t_4054df11a74e,True,Transcripts,
t_0006d41a73a8,Графики на експоненциални функции (Алгебра 2 н...,Научи повече за графиките на сложните показате...,000cf7,source,4,bg,t_e2452e21d252,True,,
...,...,...,...,...,...,...,...,...,...,...
t_fffb0bf2801d,4.3 Graph of functions,,e77b55,aligned,4,en,t_676e6a1a4dc7,False,4.3 Graph of functions,
t_fffbe1d5d43c,Inscribed shapes problem solving,Use properties of inscribed angles to prove pr...,0c929f,source,4,sw,t_50145b9bab3f,True,,
t_fffe14f1be1e,Lección 7,,6e90a7,aligned,6,es,t_d448c707984d,True,Lesson 7,
t_fffe811a6da9,تحديد العلاقة بين الإحداثيّات القطبية والإحداث...,5b9e5ca86571f90499ea987f,9fd860,source,2,ar,t_5b4f3ba4eb7d,True,Identification of the relationship between Arc...,5b9e5ca86571f90499ea987f
