In [18]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os

In [25]:
!pip install nltk

Collecting nltk
  Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting regex>=2021.8.3
  Downloading regex-2023.6.3-cp39-cp39-macosx_11_0_arm64.whl (288 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.0/289.0 kB[0m [31m30.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: regex, nltk
Successfully installed nltk-3.8.1 regex-2023.6.3


In [26]:
import nltk
# Download the 'punkt' package (tokenizers/punkt) into the local nltk_data
# directory; these are the tokenizer models used for the sentence splitting
# done later in this notebook. The call's return value is shown as the cell
# output.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/hakanmeva/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [27]:
from nltk.tokenize import sent_tokenize

In [2]:
# Years covered by the yearly parquet exports: 2016 through 2022 inclusive.
years = list(range(2016, 2023))

In [5]:
# Read one parquet file per year and stack them into a single frame.
df_burse_list = [
    pd.read_parquet('datasets/df_burse_fonduri_mutuale_' + str(yr) + '.parquet')
    for yr in years
]
df_burse = pd.concat(df_burse_list)

In [8]:
# Read one parquet file per year and stack them into a single frame.
df_pers_list = [
    pd.read_parquet('datasets/df_finante_pers_' + str(yr) + '.parquet')
    for yr in years
]
df_pers = pd.concat(df_pers_list)

## General Stats

In [12]:
# Report the number of rows in each concatenated dataset.
burse_rows = len(df_burse)
pers_rows = len(df_pers)
print('Length of Burse fonduri mutuale dataset: {}'.format(burse_rows))
print('Length of Finante personale dataset: {}'.format(pers_rows))

Length of Burse fonduri mutuale dataset: 9564
Length of Finante personale dataset: 8417


In [22]:
# Total on-disk size of the yearly Burse parquet files, in bytes;
# printed divided by 1,000,000.
burse_mem = sum(
    os.path.getsize('datasets/df_burse_fonduri_mutuale_' + str(yr) + '.parquet')
    for yr in years
)
print('Total file size for Burse fonduri mutuale dataset: {}M'.format(burse_mem / 1000000))

# Same computation for the Finante personale files. The accumulator keeps its
# original name `burse_pers` even though it holds the Finante personale total.
burse_pers = sum(
    os.path.getsize('datasets/df_finante_pers_' + str(yr) + '.parquet')
    for yr in years
)
print('Total file size for Finante personale dataset: {}M'.format(burse_pers / 1000000))

Total file size for Burse fonduri mutuale dataset: 11.91284M
Total file size for Finante personale dataset: 12.700395M


### Burse DS Sentence Stats

In [46]:
# New sentence-count column -> text column it is computed from.
burse_sentence_cols = {
    'title_sent_len': 'title',
    'headline_sent_len': 'headline',
    'summary_sent_len': 'description',
    'article_sent_len': 'article_text',
}
for count_col, text_col in burse_sentence_cols.items():
    # sent_tokenize splits each text into sentences; .str.len() then gives
    # the number of sentences per row.
    df_burse[count_col] = df_burse[text_col].apply(sent_tokenize).str.len()

In [65]:
# Summary statistics of title sentence counts, over rows with at least one sentence.
df_burse.loc[df_burse['title_sent_len'] > 0, 'title_sent_len'].describe()

count    9564.000000
mean        1.831242
std         0.987087
min         1.000000
25%         1.000000
50%         2.000000
75%         2.000000
max         8.000000
Name: title_sent_len, dtype: float64

In [66]:
# Total number of title sentences, over rows with at least one sentence.
df_burse.loc[df_burse['title_sent_len'] > 0, 'title_sent_len'].sum()

17514

In [67]:
# Summary statistics of headline sentence counts, over rows with at least one sentence.
df_burse.loc[df_burse['headline_sent_len'] > 0, 'headline_sent_len'].describe()

count    9564.000000
mean        1.512965
std         0.706359
min         1.000000
25%         1.000000
50%         1.000000
75%         2.000000
max         5.000000
Name: headline_sent_len, dtype: float64

In [70]:
# Total number of headline sentences, over rows with at least one sentence.
df_burse.loc[df_burse['headline_sent_len'] > 0, 'headline_sent_len'].sum()

14470

In [71]:
# Summary statistics of summary (description) sentence counts, over rows with at least one sentence.
df_burse.loc[df_burse['summary_sent_len'] > 0, 'summary_sent_len'].describe()

count    6799.000000
mean        1.398294
std         0.785967
min         1.000000
25%         1.000000
50%         1.000000
75%         2.000000
max        16.000000
Name: summary_sent_len, dtype: float64

In [72]:
# Total number of summary (description) sentences, over rows with at least one sentence.
df_burse.loc[df_burse['summary_sent_len'] > 0, 'summary_sent_len'].sum()

9507

In [73]:
# Summary statistics of article-body sentence counts, over rows with at least one sentence.
df_burse.loc[df_burse['article_sent_len'] > 0, 'article_sent_len'].describe()

count    9149.000000
mean        9.233250
std         9.661274
min         1.000000
25%         4.000000
50%         7.000000
75%        11.000000
max       169.000000
Name: article_sent_len, dtype: float64

In [74]:
# Total number of article-body sentences, over rows with at least one sentence.
df_burse.loc[df_burse['article_sent_len'] > 0, 'article_sent_len'].sum()

84475

### Finante Pers DS Sentence Stats

In [55]:
# New sentence-count column -> text column it is computed from.
pers_sentence_cols = {
    'title_sent_len': 'title',
    'headline_sent_len': 'headline',
    'summary_sent_len': 'description',
    'article_sent_len': 'article_text',
}
for count_col, text_col in pers_sentence_cols.items():
    # sent_tokenize splits each text into sentences; .str.len() then gives
    # the number of sentences per row.
    df_pers[count_col] = df_pers[text_col].apply(sent_tokenize).str.len()

In [75]:
# Summary statistics of title sentence counts, over rows with at least one sentence.
df_pers.loc[df_pers['title_sent_len'] > 0, 'title_sent_len'].describe()

count    8417.000000
mean        1.827848
std         1.003058
min         1.000000
25%         1.000000
50%         2.000000
75%         2.000000
max         8.000000
Name: title_sent_len, dtype: float64

In [76]:
# Total number of title sentences, over rows with at least one sentence.
df_pers.loc[df_pers['title_sent_len'] > 0, 'title_sent_len'].sum()

15385

In [77]:
# Summary statistics of headline sentence counts, over rows with at least one sentence.
df_pers.loc[df_pers['headline_sent_len'] > 0, 'headline_sent_len'].describe()

count    8417.000000
mean        1.499228
std         0.658760
min         1.000000
25%         1.000000
50%         1.000000
75%         2.000000
max         5.000000
Name: headline_sent_len, dtype: float64

In [78]:
# Total number of headline sentences, over rows with at least one sentence.
df_pers.loc[df_pers['headline_sent_len'] > 0, 'headline_sent_len'].sum()

12619

In [79]:
# Summary statistics of summary (description) sentence counts, over rows with at least one sentence.
df_pers.loc[df_pers['summary_sent_len'] > 0, 'summary_sent_len'].describe()

count    7521.000000
mean        1.398218
std         0.743798
min         1.000000
25%         1.000000
50%         1.000000
75%         2.000000
max         7.000000
Name: summary_sent_len, dtype: float64

In [80]:
# Total number of summary (description) sentences, over rows with at least one sentence.
df_pers.loc[df_pers['summary_sent_len'] > 0, 'summary_sent_len'].sum()

10516

In [81]:
# Summary statistics of article-body sentence counts, over rows with at least one sentence.
df_pers.loc[df_pers['article_sent_len'] > 0, 'article_sent_len'].describe()

count    8411.000000
mean       10.593865
std        14.979656
min         1.000000
25%         5.000000
50%         7.000000
75%        11.000000
max       327.000000
Name: article_sent_len, dtype: float64

In [82]:
# Total number of article-body sentences, over rows with at least one sentence.
df_pers.loc[df_pers['article_sent_len'] > 0, 'article_sent_len'].sum()

89105

### Burse DS Word Stats

In [None]:
# TODO: include vocab stats
# TODO: include the number of words contained in the article
# TODO: plot histograms in parallel

### Finante Pers DS Word Stats

In [None]:
# TODO: include vocab stats
# TODO: include the number of words contained in the article
# TODO: plot histograms in parallel

In [None]:
# TODO: calculate the number of words from the title, headline, and summary that are found in the article body
# TODO: calculate stats for the references used in the recommender system: how many have recommendations, and how many unique recommendations there are