### Exploratory Data Analysis

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Base directory of the datasets, given as a relative path. The trailing slash
# is required: file paths are built from it by plain string concatenation.
data_folder = "../../data/"

Read and merge the processed training and test sets

In [None]:
# Processed train and test splits, in that order, relative to `data_folder`.
split_files = (
    'processed/train-set-cat1-processed.csv',
    'processed/test-set-cat1-processed.csv',
)

# Load both splits, then stack them into a single frame with a fresh 0..n-1 index.
df1, df2 = (pd.read_csv(data_folder + file_name) for file_name in split_files)
df = pd.concat([df1, df2], ignore_index=True)

Counting the number of articles for each label (topic/category of the article)

In [None]:
# Number of rows per label, shown as a two-column table (label, counts).
(
    df.groupby(['label'])
    .size()
    .reset_index(name='counts')
)

In [None]:
# Overall size of the merged dataset and number of distinct label values.
n_articles = len(df)
n_categories = len(df['label'].unique())

print(f"There are {n_articles} ArXiv articles.")
print(f"There are {n_categories} categories in the dataset.")

Bi-grams analysis: most frequently used bi-grams

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Count every bi-gram (two consecutive word tokens) in each text.
word_vectorizer = CountVectorizer(analyzer='word', ngram_range=(2, 2))
sparse_matrix = word_vectorizer.fit_transform(df['text'])

# Column sums give the total occurrences of each bi-gram over all texts;
# the transpose turns them into one row per bi-gram.
frequencies = sparse_matrix.sum(axis=0).T

df_bigrams = pd.DataFrame(
    frequencies,
    index=word_vectorizer.get_feature_names_out(),
    columns=['frequency'],
)

# Ten most frequent bi-grams.
df_bigrams.sort_values(by='frequency', ascending=False).head(10)

Tri-grams analysis: most frequently used tri-grams

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Count every tri-gram (three consecutive word tokens) in each text.
word_vectorizer = CountVectorizer(analyzer='word', ngram_range=(3, 3))
sparse_matrix = word_vectorizer.fit_transform(df['text'])

# Column sums give the total occurrences of each tri-gram over all texts;
# the transpose turns them into one row per tri-gram.
frequencies = sparse_matrix.sum(axis=0).T

df_trigrams = pd.DataFrame(
    frequencies,
    index=word_vectorizer.get_feature_names_out(),
    columns=['frequency'],
)

# Ten most frequent tri-grams.
df_trigrams.sort_values(by='frequency', ascending=False).head(10)

TF-IDF analysis with bi-grams

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF weight of every bi-gram (two consecutive word tokens) in each text.
word_vectorizer = TfidfVectorizer(ngram_range=(2, 2), analyzer='word')
sparse_matrix = word_vectorizer.fit_transform(df['text'])

# Sum each bi-gram's TF-IDF weight over all texts; the transpose turns the
# sums into one row per bi-gram.
tfidf = sparse_matrix.sum(axis=0).T

# The frame is named for what it holds: bi-gram scores (ngram_range is (2, 2)),
# not tri-grams.
df_bigrams_tfidf = pd.DataFrame(
    tfidf,
    index=word_vectorizer.get_feature_names_out(),
    columns=['tfidf'],
)

# Ten bi-grams with the highest summed TF-IDF weight.
df_bigrams_tfidf.sort_values(by='tfidf', ascending=False).head(10)

Number of articles for each category

In [None]:
# Count the articles per label once, so the colour array can be sized to the
# number of bars (categories) rather than to the number of rows in `df`.
label_counts = df.groupby(['label']).size().reset_index(name='counts')

label_counts.plot.bar(x='label',
                      y='counts',
                      figsize=(5, 4),
                      # One colour from the Paired colormap per bar.
                      color=plt.cm.Paired(np.arange(len(label_counts))),
                      legend=False,
                      ylabel='Number of articles',
                      rot=0)
# Render the figure explicitly so the cell does not also echo the Axes repr.
plt.show()

Word cloud of the most frequent words

In [None]:
# Join every text into one space-separated string and build a word cloud
# from it with the default WordCloud settings.
corpus_text = ' '.join(df['text'])
wc_prova = WordCloud().generate(corpus_text)

# Display the cloud as an image, without axis ticks or frame.
plt.imshow(wc_prova)
plt.axis("off")
plt.show()

Analysis of the number of words per category

In [None]:
# Compute the number of words for each article.
# `assign` returns a new frame, so `df` itself is left untouched.
# Words are counted by splitting on runs of whitespace: unlike counting single
# spaces and adding one, this is not inflated by repeated, leading or trailing
# whitespace, and an empty text yields 0 instead of 1.
df_copy = df.assign(word_counts=df['text'].str.split().str.len())

In [None]:
# Total number of words per label: sum the per-article word counts in each group.
df_word_amount_label = (
    df_copy.groupby(['label'])['word_counts']
    .sum()
    .reset_index(name='counts')
)

In [None]:
# Bar chart of the total word count per label, one Paired-colormap colour per bar.
fig, ax = plt.subplots(figsize=(5, 4))
ax.bar(df_word_amount_label['label'],
       df_word_amount_label['counts'],
       color=plt.cm.Paired(np.arange(len(df_word_amount_label))))

# Label both axes so the figure can be read on its own.
ax.set_xlabel('label')
ax.set_ylabel('Number of words')

# Print the counts as plain numbers rather than in scientific notation.
# Restricted to the y axis: that is the only axis carrying large values, and
# the call would fail on an x axis that does not use a scalar tick formatter.
ax.ticklabel_format(axis='y', style='plain')
plt.show()

Average number of words per abstract, for each category

In [13]:
# Mean of the per-article word counts within each label.
(
    df_copy.groupby(['label'])['word_counts']
    .mean()
    .reset_index(name='mean')
)

Unnamed: 0,label,mean
0,0,55.621524
1,1,64.796
2,2,67.351
3,3,24.261437
4,4,42.03498
5,5,74.8004
6,6,65.750444
7,7,51.344333
