In [None]:
!pip install requests parsel playwright

In [1]:
# Read modules 
from bs4 import BeautifulSoup
import requests, lxml, os, json
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
from datetime import date 

## 1. Google scholar 

In [2]:
from parsel import Selector
import requests, re, json

def scrape_all_profiles_from_university(university_name: str):

    # https://docs.python-requests.org/en/master/user/quickstart/#passing-parameters-in-urls
    params = {
        "view_op": "search_authors",  # author results
        "mauthors": university_name,  # search query
        "hl": "en",                   # language
        "astart": 0                   # page number
    }

    # https://docs.python-requests.org/en/master/user/quickstart/#custom-headers
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.87 Safari/537.36",
    }

    profile_results = []

    profiles_is_present = True
    while profiles_is_present:

        html = requests.get("https://scholar.google.com/citations", params=params, headers=headers, timeout=30)
        select = Selector(html.text)

        # print(f"extracting authors at page #{params['astart']}.")

        for profile in select.css(".gs_ai_chpr"):
            name = profile.css(".gs_ai_name a::text").get()
            link = f'https://scholar.google.com{profile.css(".gs_ai_name a::attr(href)").get()}'
            
            affiliations = profile.css(".gs_ai_aff").xpath('normalize-space()').get()
            email = profile.css(".gs_ai_eml::text").get()
            cited_by = re.search(r"\d+", profile.xpath('//div[@class="gs_ai_cby"]').get()).group()  # Cited by 17143 -> 17143
            interests = profile.css(".gs_ai_one_int::text").getall()

            profile_results.append({
                "profile_name": name,
                "profile_link": link,
                "profile_affiliations": affiliations,
                "profile_email": email,
                "profile_city_by_count": cited_by,
                "profile_interests": interests
            })

        # if next page token is present -> update next page token and increment 10 to get the next page
        if select.css("button.gs_btnPR::attr(onclick)").get():
            # https://regex101.com/r/e0mq0C/1
            params["after_author"] = re.search(r"after_author\\x3d(.*)\\x26", select.css("button.gs_btnPR::attr(onclick)").get()).group(1)  # -> XB0HAMS9__8J
            params["astart"] += 10
        else:
            profiles_is_present = False
            
    return profile_results


# print(json.dumps(scrape_all_profiles_from_university(university_name="EM Normandie"), indent=2, ensure_ascii=False))
data = pd.DataFrame(scrape_all_profiles_from_university(university_name="EM Normandie"))
data['author_ID'] = data['profile_link'].apply(lambda x: x.split('=')[-1])
data.drop(['profile_affiliations', 'profile_email', 'profile_city_by_count'],axis =1, inplace = True)
data.head(3)

Unnamed: 0,profile_name,profile_link,profile_interests,author_ID
0,Ana Beatriz Lopes de Sousa Jabbour,https://scholar.google.com/citations?hl=en&use...,"[Sustainable Supply Chain Management, Circular...",oOmOcaAAAAAJ
1,Christophe Rosenberger,https://scholar.google.com/citations?hl=en&use...,"[Biometrics, Security & Privacy]",TpTmMrkAAAAJ
2,Jean-Christophe Avice,https://scholar.google.com/citations?hl=en&use...,"[Nitrogen and Sulfur Use Efficiency, Nutrients...",TkTj0I4AAAAJ


In [3]:
## Take the names from google scholar 
# import requests, lxml, os

headers = {
    'User-agent':
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
}

## get the data for each author 
citations_al = []; citations_sinc = []; h_index_al = []; h_index_sinc= []; i10_index_al = []; i10_index_sinc = []
for link in data['profile_link']:
    html = requests.get(link, headers=headers).text
    soup = BeautifulSoup(html, 'lxml')
    # print(f'Collecting data for: {name}')
    for cited_by_public_access in soup.select('.gsc_rsb'):
        try:
            citations_all = cited_by_public_access.select_one('tr:nth-child(1) .gsc_rsb_sc1+ .gsc_rsb_std').text
        except:
            None  
        try:    
            citations_since = cited_by_public_access.select_one('tr:nth-child(1) .gsc_rsb_std+ .gsc_rsb_std').text
        except:
            None    
        try:
            h_index_all = cited_by_public_access.select_one('tr:nth-child(2) .gsc_rsb_sc1+ .gsc_rsb_std').text
        except:
            None 
        try:
            h_index_since = cited_by_public_access.select_one('tr:nth-child(2) .gsc_rsb_std+ .gsc_rsb_std').text
        except:
            None 
        try: 
            i10_index_all = cited_by_public_access.select_one('tr~ tr+ tr .gsc_rsb_sc1+ .gsc_rsb_std').text
        except: 
            None 
        try: 
            i10_index_since = cited_by_public_access.select_one('tr~ tr+ tr .gsc_rsb_std+ .gsc_rsb_std').text
        except: 
            None 
        # append the output into a list 
        citations_al.append(citations_all)
        citations_sinc.append(citations_since)
        h_index_al.append(h_index_all)
        h_index_sinc.append(h_index_since)
        i10_index_al.append(i10_index_all)
        i10_index_sinc.append(i10_index_since)
    # print(f'{citations_all}\n{citations_since2016}\n{h_index_all}\n{h_index_2016}\n{i10_index_all}\n{i10_index_2016}')
extra_data = pd.DataFrame([citations_al, citations_sinc, h_index_al, h_index_sinc, i10_index_al, i10_index_sinc]).T
extra_data.columns = ['Total citations', 'Citation l 5 yrs', 'Total H-index', "H-index l 5 yrs", "Total i10-index", 'i10-index l 5 yrs']
output = pd.concat([data, extra_data], axis =1)

In [4]:
import pandas as pd
from bs4 import BeautifulSoup
import requests, lxml, os

from serpapi import GoogleScholarSearch
from urllib.parse import urlsplit, parse_qsl



def scrape_all_authors_articles(author_id: str):
    params = {
        "user": author_id,   # user-id
        "hl": "en",          # language
        "gl": "us",          # country to search from
        "cstart": 0,         # articles page. 0 is the first page
        "pagesize": "100"    # articles per page
        }

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.41 Safari/537.36",
        }

    all_articles = []

    articles_is_present = True
    while articles_is_present:
        html = requests.post("https://scholar.google.com/citations", params=params, headers=headers, timeout=30)
        soup = BeautifulSoup(html.text, "lxml")

        for index, article in enumerate(soup.select(".gsc_a_tr"), start=1):
            try:
                article_title = article.select_one(".gsc_a_at").text
            except: article_title = None

            try:
                article_link = f'https://scholar.google.com{article.select_one(".gsc_a_at")["href"]}'
            except: article_link = None

            try:
                article_authors = article.select_one(".gsc_a_at+ .gs_gray").text
            except: article_authors = None

            try:
                article_publication = article.select_one(".gs_gray+ .gs_gray").text
            except: article_publication = None
                
            try:
                article_year = article.select_one(".gsc_a_hc").text
            except: article_year = None

            all_articles.append({
                "title": article_title,
                "link": article_link,
                "authors": article_authors,
                "publication": article_publication,
                "article_year": article_year
                })

        # this selector is checking for the .class that contains: "There are no articles in this profile."
        # example link: https://scholar.google.com/citations?user=VjJm3zYAAAAJ&hl=en&cstart=500&pagesize=100
        if soup.select_one(".gsc_a_e"):
            articles_is_present = False
        else:
            params["cstart"] += 100  # paginate to the next page

    pd.DataFrame(data=all_articles).to_csv(f"Articles/google_scholar_{params['user']}_articles.csv", encoding="utf-8", index=False)

user_ids = data['author_ID'].tolist()

for _id in user_ids:
    scrape_all_authors_articles(author_id=_id)
    
    
# # SerpApi solution

# def serpapi_scrape_all_author_articles(author_id: str):
#     params = {
#         "api_key": os.getenv("API_KEY"),    # your SerpApi API key
#         "engine": "google_scholar_author",  # search engine
#         "hl": "en",                         # language
#         "author_id": author_id,             # author ID
#         "start": "0",                       # articles page
#         "num": "100"                        # articles per page
#         }

#     search = GoogleScholarSearch(params)    # where data extraction happens on SerpApi backend.

#     all_articles = []

#     articles_is_present = True
#     while articles_is_present:
#         results = search.get_dict()         # JSON -> Python dictionary

#         for index, article in enumerate(results["articles"], start=1):
#             title = article.get("title")
#             link = article.get("link")
#             authors = article.get("authors")
#             publication = article.get("publication")
#             citation_id = article.get("citation_id")
#             year = article.get("year")

#             print(title)

#             all_articles.append({
#                 "title": title,
#                 "link": link,
#                 "authors": authors,
#                 "publication": publication,
#                 "citation_id": citation_id,
#                 "year": year
#                 })

#         if "next" in results.get("serpapi_pagination", []):
#             # split URL in parts as a dict() and update "search" variable to a new page
#             search.params_dict.update(dict(parse_qsl(urlsplit(results["serpapi_pagination"]["next"]).query)))
#         else:
#             articles_is_present = False

#     pd.DataFrame(data=all_articles).to_csv(f"serpapi_google_scholar_{params['author_id']}_articles.csv", encoding="utf-8", index=False)


# serpapi_scrape_all_author_articles(author_id="VjJm3zYAAAAJ")

In [5]:
## read the tables 
import os 
files= [file for file in os.listdir('Articles/') if file.endswith('.csv')]

df = pd.DataFrame()

for file in files:
    read = pd.read_csv('Articles'+'/'+file)
    print(file)
    read['author_ID'] = file.split('_')[2]
    df = df.append(read)
    
df.head()

google_scholar_2T8gU8AAAAAJ_articles.csv


  df = df.append(read)
  df = df.append(read)
  df = df.append(read)
  df = df.append(read)
  df = df.append(read)
  df = df.append(read)
  df = df.append(read)
  df = df.append(read)
  df = df.append(read)
  df = df.append(read)
  df = df.append(read)
  df = df.append(read)
  df = df.append(read)
  df = df.append(read)


google_scholar_4KLCJVgAAAAJ_articles.csv
google_scholar_4V2lG70AAAAJ_articles.csv
google_scholar_4ZCoTcUAAAAJ_articles.csv
google_scholar_7MtUDEwAAAAJ_articles.csv
google_scholar_8R7Cg9AAAAAJ_articles.csv
google_scholar_9ZyidVQAAAAJ_articles.csv
google_scholar_BcTSDckAAAAJ_articles.csv
google_scholar_bHUDVG8AAAAJ_articles.csv
google_scholar_bMvX7x4AAAAJ_articles.csv
google_scholar_b_DFLQQAAAAJ_articles.csv
google_scholar_C6oJtcgAAAAJ_articles.csv
google_scholar_cnlixw0AAAAJ_articles.csv
google_scholar_cqyUcDoAAAAJ_articles.csv
google_scholar_D0slPrsAAAAJ_articles.csv


  df = df.append(read)
  df = df.append(read)
  df = df.append(read)
  df = df.append(read)
  df = df.append(read)
  df = df.append(read)
  df = df.append(read)
  df = df.append(read)
  df = df.append(read)
  df = df.append(read)
  df = df.append(read)
  df = df.append(read)
  df = df.append(read)
  df = df.append(read)
  df = df.append(read)


google_scholar_Db4aA10AAAAJ_articles.csv
google_scholar_DgrxQGwAAAAJ_articles.csv
google_scholar_DP_OufkAAAAJ_articles.csv
google_scholar_DtnlwoAAAAAJ_articles.csv
google_scholar_Ex4TwC8AAAAJ_articles.csv
google_scholar_f-nuL0sAAAAJ_articles.csv
google_scholar_FsTjmgUAAAAJ_articles.csv
google_scholar_fz4zTlEAAAAJ_articles.csv
google_scholar_g7lQUh8AAAAJ_articles.csv
google_scholar_g9DEiLEAAAAJ_articles.csv
google_scholar_GVdUNXcAAAAJ_articles.csv
google_scholar_H6_9aUcAAAAJ_articles.csv
google_scholar_ht_RNEoAAAAJ_articles.csv
google_scholar_iAjlUXIAAAAJ_articles.csv
google_scholar_IeGHNosAAAAJ_articles.csv


  df = df.append(read)
  df = df.append(read)
  df = df.append(read)
  df = df.append(read)
  df = df.append(read)
  df = df.append(read)
  df = df.append(read)
  df = df.append(read)
  df = df.append(read)
  df = df.append(read)
  df = df.append(read)
  df = df.append(read)
  df = df.append(read)
  df = df.append(read)
  df = df.append(read)
  df = df.append(read)
  df = df.append(read)


google_scholar_iJUWw-EAAAAJ_articles.csv
google_scholar_J1cWljgAAAAJ_articles.csv
google_scholar_JmjHO0EAAAAJ_articles.csv
google_scholar_jmQ8r44AAAAJ_articles.csv
google_scholar_KFLFnasAAAAJ_articles.csv
google_scholar_KZjz3gwAAAAJ_articles.csv
google_scholar_M4epQ9sAAAAJ_articles.csv
google_scholar_MH633zgAAAAJ_articles.csv
google_scholar_mNELtuMAAAAJ_articles.csv
google_scholar_mYcyfrsAAAAJ_articles.csv
google_scholar_myltOpUAAAAJ_articles.csv
google_scholar_oOmOcaAAAAAJ_articles.csv
google_scholar_OUuVpSQAAAAJ_articles.csv
google_scholar_oUVvj1MAAAAJ_articles.csv
google_scholar_P3-N38IAAAAJ_articles.csv
google_scholar_Ql8nMHIAAAAJ_articles.csv
google_scholar_R80_WFUAAAAJ_articles.csv


  df = df.append(read)
  df = df.append(read)
  df = df.append(read)
  df = df.append(read)
  df = df.append(read)
  df = df.append(read)
  df = df.append(read)
  df = df.append(read)
  df = df.append(read)
  df = df.append(read)
  df = df.append(read)
  df = df.append(read)
  df = df.append(read)
  df = df.append(read)
  df = df.append(read)


google_scholar_RfEoWAgAAAAJ_articles.csv
google_scholar_rN_7AkoAAAAJ_articles.csv
google_scholar_TkTj0I4AAAAJ_articles.csv
google_scholar_TpTmMrkAAAAJ_articles.csv
google_scholar_TqBwgC4AAAAJ_articles.csv
google_scholar_TSXET9gAAAAJ_articles.csv
google_scholar_Twj3qDQAAAAJ_articles.csv
google_scholar_vg4TNUkAAAAJ_articles.csv
google_scholar_vNx4Sz8AAAAJ_articles.csv
google_scholar_x-RZHpkAAAAJ_articles.csv
google_scholar_xaceBrYAAAAJ_articles.csv
google_scholar_Xf2BqssAAAAJ_articles.csv
google_scholar_xglEZBkAAAAJ_articles.csv
google_scholar_XmiJS4AAAAAJ_articles.csv
google_scholar_xsoh9wcAAAAJ_articles.csv
google_scholar_YCSIH30AAAAJ_articles.csv
google_scholar_YseQa5wAAAAJ_articles.csv
google_scholar_yxk7n1kAAAAJ_articles.csv
google_scholar_zuUqL2IAAAAJ_articles.csv
google_scholar_zZG_s_8AAAAJ_articles.csv
google_scholar__fjAt0QAAAAJ_articles.csv
google_scholar__TyiCq4AAAAJ_articles.csv


  df = df.append(read)
  df = df.append(read)
  df = df.append(read)
  df = df.append(read)
  df = df.append(read)
  df = df.append(read)
  df = df.append(read)
  df = df.append(read)


Unnamed: 0,title,link,authors,publication,article_year,author_ID
0,Economic policy uncertainty and stock markets:...,https://scholar.google.com/citations?view_op=v...,"M Arouri, C Estay, C Rault, D Roubaud","Finance Research Letters 18, 136-141, 2016",2016.0,2T8gU8AAAAAJ
1,Entrepreneurship: From motivation to start-up,https://scholar.google.com/citations?view_op=v...,"C Estay, F Durrieu, M Akhter",Journal of international Entrepreneurship 11 (...,2013.0,2T8gU8AAAAAJ
2,How and when compulsory citizenship behavior l...,https://scholar.google.com/citations?view_op=v...,"P He, Z Peng, H Zhao, C Estay","Journal of Business Ethics 155 (1), 259-274, 2019",2019.0,2T8gU8AAAAAJ
3,Compulsory citizenship behavior and employee s...,https://scholar.google.com/citations?view_op=v...,"P He, X Wang, M Wu, C Estay",Social Behavior and Personality: an internatio...,2018.0,2T8gU8AAAAAJ
4,How destructive leadership influences compulso...,https://scholar.google.com/citations?view_op=v...,"M Wu, Z Peng, C Estay","Chinese Management Studies, 2018",2018.0,2T8gU8AAAAAJ


In [6]:
dataframe = pd.merge(output, df, on = 'author_ID')

In [7]:
dataframe.drop('link', axis = 1, inplace = True)
# dataframe['article_year'].astype(int)
dataframe

Unnamed: 0,profile_name,profile_link,profile_interests,author_ID,Total citations,Citation l 5 yrs,Total H-index,H-index l 5 yrs,Total i10-index,i10-index l 5 yrs,title,authors,publication,article_year
0,Ana Beatriz Lopes de Sousa Jabbour,https://scholar.google.com/citations?hl=en&use...,"[Sustainable Supply Chain Management, Circular...",oOmOcaAAAAAJ,9842,8680,49,44,109,102,Selecting green suppliers based on GSCM practi...,"D Kannan, ABL de Sousa Jabbour, CJC Jabbour",European Journal of operational research 233 (...,2014.0
1,Ana Beatriz Lopes de Sousa Jabbour,https://scholar.google.com/citations?hl=en&use...,"[Sustainable Supply Chain Management, Circular...",oOmOcaAAAAAJ,9842,8680,49,44,109,102,Industry 4.0 and the circular economy: a propo...,"ABL de Sousa Jabbour, CJC Jabbour, M Godinho F...","Annals of Operations Research 270 (1), 273-286...",2018.0
2,Ana Beatriz Lopes de Sousa Jabbour,https://scholar.google.com/citations?hl=en&use...,"[Sustainable Supply Chain Management, Circular...",oOmOcaAAAAAJ,9842,8680,49,44,109,102,When titans meet–Can industry 4.0 revolutionis...,"ABL de Sousa Jabbour, CJC Jabbour, C Foropon, ...",Technological Forecasting and Social Change 13...,2018.0
3,Ana Beatriz Lopes de Sousa Jabbour,https://scholar.google.com/citations?hl=en&use...,"[Sustainable Supply Chain Management, Circular...",oOmOcaAAAAAJ,9842,8680,49,44,109,102,Green human resource management and green supp...,"CJC Jabbour, ABL de Sousa Jabbour","Journal of Cleaner Production 112, 1824-1833, ...",2016.0
4,Ana Beatriz Lopes de Sousa Jabbour,https://scholar.google.com/citations?hl=en&use...,"[Sustainable Supply Chain Management, Circular...",oOmOcaAAAAAJ,9842,8680,49,44,109,102,Environmental management and operational perfo...,"CJC Jabbour, ABL de Sousa Jabbour, K Govindan,...","Journal of Cleaner Production 47, 129-140, 2013",2013.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3365,Aneta Hamza-Orlinska,https://scholar.google.com/citations?hl=en&use...,[Human Resource Management],C6oJtcgAAAAJ,1,1,1,1,0,0,Corporate Commitment to Diversity in the Local...,AA HAmzA-OrlinskA,Journal of Corporate Responsibility and Leader...,2017.0
3366,Aneta Hamza-Orlinska,https://scholar.google.com/citations?hl=en&use...,[Human Resource Management],C6oJtcgAAAAJ,1,1,1,1,0,0,Corporate Commitment to Diversity in the Local...,A Orlinska,Central and Eastern Europe in the Changing Bus...,2017.0
3367,Aneta Hamza-Orlinska,https://scholar.google.com/citations?hl=en&use...,[Human Resource Management],C6oJtcgAAAAJ,1,1,1,1,0,0,,,,
3368,Eléonore Mandel,https://scholar.google.com/citations?hl=en&use...,[Intercultural management - Intercultural …],bMvX7x4AAAAJ,1,1,1,1,0,0,The French culturalist way: an interpretative ...,E Mandel,"M@ n@ gement 15 (4), 441-451, 2012",2012.0


In [8]:
dataframe = dataframe[dataframe['profile_name'].isin(dq_final['profile_name'])]

NameError: name 'dq_final' is not defined

In [None]:
dataframe.drop('profile_link', axis = 1, inplace = True)

In [None]:
dataframe

In [None]:
dataframe.to_excel('../Output/Large_dataframe.xlsx', index = False)

### citations across the years 

In [None]:
from serpapi import GoogleSearch
author_results_data = []

for author_ID in data['author_ID']:

    params = {
            "api_key": "a2491515737940ffae50167db32b18bf11564b985a3ad4d4e43f9cdae0b6f5eb",      # SerpApi API key
            "engine": "google_scholar_author",    # author results search engine
            "author_id": f"{author_ID}",  # search query: _xwYD2sAAAAJ
            "hl": "en"
        }

    search = GoogleSearch(params)
    results = search.get_dict()

#     thumbnail = results.get("author").get("thumbnail")
#     name = results.get("author").get("name")
#     affiliations = results.get("author").get("affiliations")
#     email = results.get("author").get("email")
#     website = results.get("author").get("website")
#     interests = results.get("author").get("interests")

#     cited_by_table = results.get("cited_by", {}).get("table")
    cited_by_graph = results.get("cited_by", {}).get("graph")

    # public_access_link = results.get("public_access", {}).get("link")
    # available_public_access = results.get("public_access", {}).get("available")
    # not_available_public_access = results.get("public_access", {}).get("not_available")
    # co_authors = results.get("co_authors")

    author_results_data.append({
      # "thumbnail": thumbnail,
      # "name": name,
      # "affiliations": affiliations,
      # "email": email,
      # "website": website,
      # "interests": interests,
      # "cited_by_table": cited_by_table,
      "cited_by_graph": cited_by_graph,
      # "public_access_link": public_access_link,
      # "available_public_access": available_public_access,
      # "not_available_public_access": not_available_public_access,
      # "co_authors": co_authors
      "author_ID": author_ID 
    })

In [None]:
s = pd.DataFrame(author_results_data)
s.head(20)

In [None]:
s = s[s['cited_by_graph'].notna()]

k = {}

for i in s['author_ID']:
    q = s[s['author_ID'] == i][['cited_by_graph']].iloc[0][0]
    k[i] = pd.DataFrame.from_dict(q)
    

In [None]:
final = pd.DataFrame()

for v in k.keys():
    da = k.get(v)
    da['author_ID'] = v 
    final = final_df.append(da)

In [None]:
final['year'] = pd.to_datetime(final['year'], format='%Y')
final

In [None]:
dd = pd.merge(data, final, on = 'author_ID')
dd

In [None]:
df_final = dd.pivot_table(index = ['profile_name', 'author_ID'], columns = 'year', values = 'citations')
df_final.fillna(0, inplace = True)
df_final = df_final.iloc[:, -13:]
df_final.columns = [str(i).split('-')[0] for i in df_final.columns]
df_final = df_final.reset_index()
df_final.drop(['profile_name'], axis =1, inplace = True)

In [None]:
dq = pd.merge(output, df_final, on = 'author_ID')
dq.drop(['profile_link', 'author_ID'], axis =1, inplace = True)
dq

In [None]:
per = pd.read_excel('../input/PP récap 02-22.xlsx')
per.columns = ['fam_name', 'name']
one = per['fam_name']
one.columns = ['name']
two = per['name']
per = pd.concat([one, two])
per

In [None]:
## filter the dataframe 
EM_name_list = []

for name in per:
    for full in dq['profile_name']:
        if name.lower() in full.lower():
            EM_name_list.append(full)

In [None]:
pub = dataframe.groupby('profile_name').size().to_frame()
pub = pub.reset_index()
pub.columns = ['profile_name', 'Number of publications']
pub

In [None]:
dq_final = dq[dq['profile_name'].isin(EM_name_list)]
mask = ['Christophe Rosenberger', 'Jean-Christophe Avice', 'Paul Honeine', 'Nabil Khelil', 'Amandine Laré', 'Diana Santistevan', 'Arnaud DELANNOY', 'Vincent Meyer']
dq_final = dq_final[~(dq_final['profile_name'].isin(mask))]
dq_final =dq_final.reset_index()
dq_final.drop('index', axis =1, inplace = True)
dq_final

In [None]:
dq_final = pd.merge(dq_final, pub, on ='profile_name', how = 'left')
dq_final 

In [None]:
dq_final.to_excel('../Output/Citation_dataframe.xlsx', index = False)

# 3. EDA

In [200]:
ot = {}

for i in df['author_ID'].tolist():
    s = df[df['author_ID'] == i]
    titles = [" - ".join(s['title'])]
    ot[i] = titles
    
ot = pd.DataFrame(ot).T.reset_index()
ot.columns = ['author_ID', 'Titles']
ot

TypeError: sequence item 64: expected str instance, float found

In [201]:
data = pd.merge(data, ot, on = 'author_ID')

TypeError: Can only merge Series or DataFrame objects, a <class 'dict'> was passed

In [None]:
data.to_excel('../Output/Dataset.xlsx')

In [None]:
from wordcloud import WordCloud

In [None]:
## clean the coprus 
import nltk
from nltk.corpus import stopwords
print(stopwords.words('french'))

In [None]:
from nltk.tokenize import word_tokenize
data['Titles'] = data['Titles'].apply(word_tokenize)

In [None]:
from nltk.corpus import stopwords
nltk.download('stopwords')

stop_words = set(stopwords.words('english') + stopwords.words('french'))
data['Titles']= data['Titles'].apply(lambda x:[word for word in x if word not in stop_words])

In [None]:
m = " ".join(data['Titles'].iloc[0])

In [None]:
word_cloud = WordCloud(width=1200, height=800, collocations = False, background_color = 'white').generate(m)
plt.imshow(word_cloud, interpolation='bilinear', cmap = 'Reds')
plt.axis("off")
plt.title('EM Normandie Business School wordcould', loc = 'left')
plt.tight_layout(pad=0)
plt.savefig('EM.png', dpi = 400)
# plt.savefig(f'../WordCould/{i}.png', dpi = 400, facecolor = 'w')

In [None]:
## Display the generated Word Cloud

for i in data['profile_name']:
    d = " ".join(data[data['profile_name'] == i].Titles.iloc[0])
    word_cloud = WordCloud(width=1200, height=800, collocations = False, background_color = 'white').generate(d)
    plt.imshow(word_cloud, interpolation='bilinear', cmap = 'Reds')
    plt.axis("off")
    plt.title(f'Author: {i}', loc = 'left')
    plt.tight_layout(pad=0)
    plt.savefig(f'../WordCould/{i}.png', dpi = 400, facecolor = 'w')
    # plt.show()