In [92]:
! pip install pandas python-dotenv openpyxl -q


[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
import json
import pandas as pd
from dotenv import load_dotenv

# Load variables from a local .env file (if one exists) into the process environment.
# NOTE(review): none of the cells visible in this notebook read an environment
# variable afterwards — confirm this call is still needed.
load_dotenv()

# Project folders, relative to this notebook.
# Stored WITHOUT a trailing slash: every consumer builds paths as f"{FOLDER}/...",
# so a trailing slash here would put "//" inside every resulting path.
BASE_PATH = "../1 - scrapping"                      # scraper output: search strings and result CSVs
THIS_FOLDER = "../2 - data_handling"                # where this notebook writes its export
MANUAL_FILTERING_PATH = "../3 - manual_filtering"   # receives a copy of the export


In [34]:

# Load the search-string definitions written by the scraping step.
# The encoding is explicit so the file is decoded the same way on every platform
# (without it, open() falls back to the locale's default encoding).
with open(f'{BASE_PATH}/search_strings.json', 'r', encoding='utf-8') as f:
    search_string_json = json.load(f)

# One category label per entry; an entry without a "category" key yields None.
categories = [entry.get("category") for entry in search_string_json]
categories

['General',
 'Classic Methods',
 'Dense Embeddings',
 'Contextual Embeddings',
 'Hybrid Methods',
 'Comparison of methods',
 'Literature Reviews',
 'Applied cases']

In [39]:

def read_list_of_csvs(file_list) -> pd.DataFrame:
    """Read several CSV files and stack them into a single DataFrame.

    Each file is read with ``pd.read_csv`` and tagged with a ``filename``
    column holding the path it was read from, so every row can be traced
    back to its source file after concatenation.

    Parameters
    ----------
    file_list : iterable of path-like
        Paths of the CSV files to read, in the order they should be stacked.

    Returns
    -------
    pd.DataFrame
        All rows of all files with a fresh 0..n-1 index and an extra
        ``filename`` column.

    Raises
    ------
    ValueError
        If ``file_list`` is empty. (``pd.concat`` would raise a ValueError as
        well, but with a message that does not point at the missing input.)
    """
    frames = [pd.read_csv(file).assign(filename=file) for file in file_list]
    if not frames:
        raise ValueError("read_list_of_csvs() received no files to read")
    return pd.concat(frames, ignore_index=True)


In [40]:

# All scraped result files live in the same sub-folder of the scraping step.
results_dir = f"{BASE_PATH}/result"

# Google Scholar: a single CSV holds the results of every search string.
google_scholar_files = [f"{results_dir}/google_scholar_search_results.csv"]
df_google_scholar = read_list_of_csvs(google_scholar_files)

# Springer: one CSV per search category.
springer_ijdsr_files = [
    f"{results_dir}/springer_ijdsr_search_results_{name}.csv" for name in categories
]
df_springer_ijdsr = read_list_of_csvs(springer_ijdsr_files)


In [37]:
# Let pandas print up to 500 rows before truncating the output.
pd.options.display.max_rows = 500

In [41]:

df_google_scholar.head()

# Inclusion criteria
# 1. Includes feature extraction
#    First pass: read every title and keep the papers that involve feature extraction.
#    Second pass: read the abstracts of the remaining papers and filter again on the same criterion.
# 2. Text as regression model variables
#    First pass: read every title and keep the papers that use text as regression model variables.
#    Second pass: read the abstracts of the remaining papers and filter again on the same criterion.
# 3. Journal or conference
#    Could be filtered using the tags embedded in str_article, such as "[HTML]".
#    (The filtering cell further down uses the citation count and the presence of a link instead.)
# 4. Published in the last 3 years (2025, 2024, 2023)
#    Filtered on the year column: 'nr_year' (Google Scholar) / 'Publication Year' (Springer).


Unnamed: 0,str_search_string,nr_page,nr_order,str_article,nr_year,nr_citations,str_original_Link,str_pdf_link,category,filename
0,"""text regression"" AND (""feature extraction"" OR...",1,1,"Text regressionanalysis: A review, empirical, ...",2024.0,3,https://ieeexplore.ieee.org/abstract/document/...,https://ieeexplore.ieee.org/iel8/6287639/10380...,General,../scrapping//result/google_scholar_search_res...
1,"""text regression"" AND (""feature extraction"" OR...",1,2,Multi-representation approach totext regressio...,2015.0,8,https://ieeexplore.ieee.org/abstract/document/...,https://www.fruct.org/files/publications/volum...,General,../scrapping//result/google_scholar_search_res...
2,"""text regression"" AND (""feature extraction"" OR...",1,3,[HTML]TextRegress: A Python package for advanc...,2025.0,0,https://www.sciencedirect.com/science/article/...,https://www.sciencedirect.com/science/article/...,General,../scrapping//result/google_scholar_search_res...
3,"""text regression"" AND (""feature extraction"" OR...",1,4,Gaussian processes fortext regression,2017.0,5,https://etheses.whiterose.ac.uk/id/eprint/17619/,https://etheses.whiterose.ac.uk/id/eprint/1761...,General,../scrapping//result/google_scholar_search_res...
4,"""text regression"" AND (""feature extraction"" OR...",1,5,Analyzing online review helpfulness using a re...,2012.0,52,https://dl.acm.org/doi/abs/10.1145/2229156.222...,https://dl.acm.org/doi/pdf/10.1145/2229156.222...,General,../scrapping//result/google_scholar_search_res...


In [42]:
# Criterion 4 — published in the last 3 years (2025, 2024, 2023).
# NOTE(review): the comparison has no upper bound, so entries dated after 2025 are
# kept as well (the recorded year counts include one 2026 entry).
print(df_google_scholar['nr_year'].value_counts())

total_before_filter = len(df_google_scholar)
is_recent = df_google_scholar['nr_year'].ge(2023)  # rows with a missing year compare False
df_google_scholar = df_google_scholar.loc[is_recent].copy()

print(f"Total articles before year filter: {total_before_filter}")
print(f"Total articles after year filter: {len(df_google_scholar)}")


nr_year
2022.0    64
2024.0    59
2025.0    58
2023.0    52
2021.0    44
2020.0    32
2019.0    23
2018.0    20
2017.0    19
2016.0    12
2015.0     7
2013.0     5
2014.0     1
2012.0     1
2008.0     1
2026.0     1
2011.0     1
2009.0     1
1992.0     1
2010.0     1
Name: count, dtype: int64
Total articles before year filter: 410
Total articles after year filter: 170


In [43]:
# Criterion 3 — journal or conference.
# Peer review cannot be established from the scraped data, so as a proxy only
# articles with at least one citation and a non-null original link are kept.

total_before_filter = len(df_google_scholar)
has_citations = df_google_scholar['nr_citations'].gt(0)
has_link = df_google_scholar['str_original_Link'].notna()
df_google_scholar = df_google_scholar.loc[has_citations & has_link].copy()

print(f"Total articles before journal/conference filter: {total_before_filter}")
print(f"Total articles after journal/conference filter: {len(df_google_scholar)}")


Total articles before journal/conference filter: 170
Total articles after journal/conference filter: 119


## Springer: International Journal of Data Science and Analytics

In [44]:
# Criterion 4 — published in the last 3 years (2025, 2024, 2023).
# NOTE(review): the comparison has no upper bound, so entries dated after 2025 are
# kept as well (the recorded year counts include two 2026 entries).
print(df_springer_ijdsr['Publication Year'].value_counts())

total_before_filter = len(df_springer_ijdsr)
is_recent = df_springer_ijdsr['Publication Year'].ge(2023)
df_springer_ijdsr = df_springer_ijdsr.loc[is_recent].copy()

print(f"Total articles before year filter: {total_before_filter}")
print(f"Total articles after year filter: {len(df_springer_ijdsr)}")


Publication Year
2025    87
2024    57
2022    26
2023    26
2018     9
2021     7
2019     4
2017     4
2016     2
2026     2
2020     1
Name: count, dtype: int64
Total articles before year filter: 225
Total articles after year filter: 172


In [45]:

# Check whether any article appears in both datasets.
#
# Titles are compared on a normalised key rather than on the raw strings: the
# scraped Google Scholar titles carry prefixes such as "[HTML]" and fused words
# ("Text regressionanalysis"), and Springer titles may contain inline markup
# ("<i>SeNSe</i>"), so a raw string comparison would miss the same paper.
def _title_key(titles: pd.Series) -> pd.Series:
    """Reduce titles to a comparison key (markup, leading tags, case, spacing and punctuation removed)."""
    return (
        titles.fillna('').astype(str)
        .str.replace(r'<[^>]+>', '', regex=True)                 # inline markup such as <i>...</i>
        .str.replace(r'^\s*(?:\[[^\]]*\]\s*)+', '', regex=True)  # leading tags such as [HTML]
        .str.casefold()
        .str.replace(r'[\W_]+', '', regex=True)                  # whitespace and punctuation
    )


# Empty keys (missing or symbol-only titles) must never count as a match.
google_scholar_keys = set(_title_key(df_google_scholar['str_article'])) - {''}
springer_keys = _title_key(df_springer_ijdsr['Item Title'])

# Springer titles (original spelling, original order) whose key also occurs in Google Scholar.
articles_in_both = df_springer_ijdsr.loc[
    springer_keys.isin(google_scholar_keys), 'Item Title'
].tolist()

print(articles_in_both)


[]


# Standardize for export

In [66]:

# Target schema of the exported table: column name -> dtype, in export order.
desired_format = dict(
    title=str,
    year=int,
    citations=int,
    link=str,
    pdf_link=str,
    category=str,
    source=str,
)


In [87]:

# Bring both sources onto the shared export schema column-by-column (vectorised)
# instead of building one dict per row with iterrows().
export_columns = ['title', 'year', 'citations', 'link', 'pdf_link', 'category', 'source']

google_scholar_export = (
    df_google_scholar
    .rename(columns={
        'str_article': 'title',
        'nr_year': 'year',
        'nr_citations': 'citations',
        'str_original_Link': 'link',
        'str_pdf_link': 'pdf_link',
    })
    .assign(source='Google Scholar')
    .loc[:, export_columns]
)

springer_export = (
    df_springer_ijdsr
    .rename(columns={
        'Item Title': 'title',
        'Publication Year': 'year',
        'URL': 'link',
    })
    .assign(
        citations=-1,    # placeholder: no citation count is taken from the Springer data
        pdf_link=None,   # no PDF link is taken from the Springer data
        # The category is the part of the file name after the last underscore,
        # with the ".csv" extension removed.
        category=lambda frame: (
            frame['filename']
            .str.rsplit('_', n=1).str[-1]
            .str.replace('.csv', '', regex=False)
        ),
        source='Springer International Journal of Data Science and Analytics',
    )
    .loc[:, export_columns]
)

# Google Scholar rows first, then Springer rows, with a fresh 0..n-1 index.
df_combined = (
    pd.concat([google_scholar_export, springer_export], ignore_index=True)
    .astype(desired_format)
)

print(df_combined.shape)

df_combined.head()

(291, 7)


Unnamed: 0,title,year,citations,link,pdf_link,category,source
0,"Text regressionanalysis: A review, empirical, ...",2024,3,https://ieeexplore.ieee.org/abstract/document/...,https://ieeexplore.ieee.org/iel8/6287639/10380...,General,Google Scholar
1,Dating Greek papyri withtext regression,2023,8,https://aclanthology.org/2023.acl-long.556/,https://aclanthology.org/2023.acl-long.556.pdf,General,Google Scholar
2,[HTML]Residential load forecasting based on lo...,2024,8,https://www.mdpi.com/2071-1050/16/24/11252,https://www.mdpi.com/2071-1050/16/24/11252,General,Google Scholar
3,Regression applied to legal judgments to predi...,2023,7,https://peerj.com/articles/cs-1225/,https://peerj.com/articles/cs-1225.pdf,General,Google Scholar
4,Forecasting Scientific Impact: A Model for Pre...,2025,1,http://iapress.org/index.php/soic/article/view...,http://iapress.org/index.php/soic/article/down...,General,Google Scholar


In [None]:

# Collapse an article that appears under several categories into a single row
# whose `category` cell is the list of all its categories.
group_keys = [name for name in df_combined.columns if name != 'category']

df_combined_uniques = (
    df_combined
    # dropna=False: by default groupby silently discards every row whose key
    # contains a missing value, and `pdf_link` is missing for many rows. Keeping
    # null keys makes the result independent of how missing values are encoded.
    .groupby(group_keys, dropna=False)
    .agg(list)
    .reset_index()
)

# Every input row must be accounted for in exactly one group.
assert df_combined_uniques['category'].map(len).sum() == len(df_combined), \
    "rows were lost while grouping duplicate articles"

print(df_combined_uniques.shape)
display(df_combined_uniques.head())

(200, 7)


Unnamed: 0,title,year,citations,link,pdf_link,source,category
0,<i>SeNSe</i>: embedding alignment via semantic...,2024,-1,https://link.springer.com/article/10.1007/s410...,,Springer International Journal of Data Science...,"[Dense Embeddings, Applied cases]"
1,A MultilingualBERT EmbeddingsApproach in Ident...,2024,2,https://ieeexplore.ieee.org/abstract/document/...,,Google Scholar,[Contextual Embeddings]
2,A Pretraining Approach for Small-sample Traini...,2025,1,https://link.springer.com/article/10.1007/s109...,https://linchin.ndmctsgh.edu.tw/papers/2025/20...,Google Scholar,[Contextual Embeddings]
3,A Survey on the Impact of Pre-Trained Language...,2025,-1,https://link.springer.com/article/10.1007/s410...,,Springer International Journal of Data Science...,"[Classic Methods, Dense Embeddings, Comparison..."
4,A common-specific feature cross-fusion attenti...,2024,-1,https://link.springer.com/article/10.1007/s410...,,Springer International Journal of Data Science...,"[Dense Embeddings, Comparison of methods]"


In [93]:
import shutil

# Write the deduplicated table into this step's folder, then place a copy of the
# same file in the manual-filtering folder.
export_path = f'{THIS_FOLDER}/combined_unique_articles.xlsx'
manual_filtering_copy = f'{MANUAL_FILTERING_PATH}/combined_unique_articles.xlsx'

df_combined_uniques.to_excel(export_path, index=False)

shutil.copy2(export_path, manual_filtering_copy)


NameError: name 'THIS_FOLDER' is not defined