In [None]:
# Enlarging the screen
# Injects a small stylesheet that overrides the width of the notebook, menubar
# and toolbar containers.

from IPython.display import display, HTML

NOTEBOOK_WIDTH_CSS = """
<style>
    div#notebook-container    { width: 95%; }
    div#menubar-container     { width: 85%; }
    div#maintoolbar-container { width: 99%; }
</style>
"""

display(HTML(data=NOTEBOOK_WIDTH_CSS))

print('Enlarging the screen is done!')

# Importing libraries

import numpy
from sqlalchemy import create_engine
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from tqdm.notebook import tqdm

def _two_decimals(value):
    """Format a number with exactly two decimal places for DataFrame display."""
    return '%.2f' % value


# Display tuning: wider text columns and two-decimal floats.
pd.set_option('display.max_colwidth', 100)
pd.set_option('display.float_format', _two_decimals)

# Silence SettingWithCopyWarning for the whole session.
pd.options.mode.chained_assignment = None  # default='warn'

print('Libraries were imported successfully!')

# Loading data from sql
#
# Reads the publication/author link table and the author table from SQL Server,
# keeps authors from the selected European and North-American countries, builds
# `df_west` (one row per paper-author link with year, country and continent) and
# loads the per-paper LDA topic table indexed by EID.

# Raw string: the instance name contains a backslash that must stay literal
# ('\S' is not a valid escape sequence and triggers a warning in a normal string).
Server = r'LAPTOP-I7NEB9V3\SQLEXPRESS'
Database = 'Geopattern'
Driver = 'ODBC Driver 17 for SQL Server'
Database_Connection = f'mssql://@{Server}/{Database}?driver={Driver}'

engine = create_engine(Database_Connection)
connection = engine.connect()

df_my_data = pd.read_sql_query("select * from my_data", connection)

df_Authors = pd.read_sql_query("select * from my_data_Authors", connection)

# Harmonise country names with the spellings used in `country_list` below.
# Assign the result back instead of calling replace(inplace=True) on a column
# selection, which is not guaranteed to write through to the frame.
df_Authors['Country'] = df_Authors['Country'].replace({'Czechia': 'Czech Republic',
                                                       'North Macedonia': 'Macedonia'})

country_list = [ 'Albania',
                 'Austria',
                 'Belarus',
                 'Belgium',
                 'Bosnia and Herzegovina',
                 'Bulgaria',
                 'Canada',
                 'Croatia',
                 'Cyprus',
                 'Czech Republic',
                 'Denmark',
                 'Estonia',
                 'Finland',
                 'France',
                 'Germany',
                 'Greece',
                 'Hungary',
                 'Iceland',
                 'Ireland',
                 'Italy',
                 'Latvia',
                 'Lithuania',
                 'Luxembourg',
                 'Macedonia',
                 'Malta',
                 'Moldova',
                 'Montenegro',
                 'Netherlands',
                 'Norway',
                 'Poland',
                 'Portugal',
                 'Romania',
                 'Russia',
                 'Serbia',
                 'Slovakia',
                 'Slovenia',
                 'Spain',
                 'Sweden',
                 'Switzerland',
                 'Ukraine',
                 'United Kingdom',
                 'United States']

# .copy() so the new 'Continent' column is added to an independent frame rather
# than to a filtered view of the SQL result.
df_Authors = df_Authors[df_Authors.Country.isin(country_list)].copy()
df_Authors['Continent'] = df_Authors.Country.map(
    lambda x: 'North_America' if x in ('Canada', 'United States') else 'Europe')

df_west = df_my_data.merge(df_Authors, on = 'Author_ID', how = 'left')

# 'Country_y' is the author-table country (the merge suffixes the clashing column).
df_west = df_west[['EID', 'Author_ID', 'Year', 'Country_y', 'Continent']].rename(
    columns = {'Country_y': 'Country'})

# Rows without a country belong to authors outside `country_list`.
df_west = df_west[df_west.Country.notnull()].copy()

df_west['Year'] = df_west.Year.astype(int)

df_LDA = pd.read_csv(r'C:\Users\moham\Dropbox\QSE\Thesis\Geopattern\My data\df_LDA.csv')
df_LDA.set_index('EID', inplace = True)


print ('Loading data from sql is done!')


# Length (in years) of the independent (feature) window and of the dependent
# (target) window that immediately follows it.
width_ind = 3
width_dep = 2

# Accumulates the pair-level rows of every window.
df_data_set = pd.DataFrame()

# First year of each independent window.
list_years = [2001,2003,2005,2007,2009,2011,2013]


# Disabled alternative that generates the start years instead of listing them:
# for yrs in range(1, 21 - width_ind - width_dep, 2):
#     list_years.append(2000 + yrs)


for yr in tqdm(list_years, desc = 'Preparing the data set'):

    # Both windows are inclusive at both ends.
    Ind_win_start = yr
    Ind_win_end = Ind_win_start + width_ind - 1

    dep_win_start = Ind_win_end + 1
    dep_win_end = dep_win_start + width_dep - 1

    # Explicit copies: both frames are modified further down (dtype changes,
    # reset_index) and must not be views of df_west.
    df_ind = df_west[df_west.Year.between(Ind_win_start, Ind_win_end)].copy()
    df_dep = df_west[df_west.Year.between(dep_win_start, dep_win_end)].copy()


    # Targets..................................................................................................
    # Builds `df_data_set_dep`: one row per unordered pair of selected authors,
    # with the number of papers the pair shares in the dependent window and a
    # 0/1 collaboration flag. Also produces `list_authors` and `authors_dep`,
    # which the feature section reuses.

    # Authors with more than one paper in the independent window.
    df_count = df_ind.groupby('Author_ID').count()
    df_count2 = df_count[df_count.EID > 1].reset_index()
    df_count2['Author_ID'] = df_count2.Author_ID.astype('int64')
    set_authors_ind = set(df_count2.Author_ID.tolist())

    # Authors with at least one paper in the dependent window.
    dep_author_ids = df_dep.Author_ID.astype('int64')
    set_authors_dep = set(dep_author_ids.tolist())

    #  ---------------------------------------------------------------------------------

    list_authors = sorted(set_authors_dep & set_authors_ind)

    df_authors_dep = pd.DataFrame(data = list_authors, columns = ['Author_ID'])

    # From here on author IDs are handled as strings (pair keys are built by
    # concatenating them). assign() returns new frames, so nothing is written
    # into a slice of df_west.
    df_dep = df_dep.assign(Author_ID = dep_author_ids.astype(str)).reset_index()
    df_ind = df_ind.assign(Author_ID = df_ind.Author_ID.astype(str))
    df_authors_dep['Author_ID'] = df_authors_dep.Author_ID.astype(str)

    # Placeholder created before the merge so 'EIDs' sits right after 'Author_ID'.
    df_authors_dep.insert(1,'EIDs','')

    df_authors_dep = df_authors_dep.merge(df_Authors, on = 'Author_ID', how = 'left')

    # EIDs of each author's dependent-window papers, in df_dep row order.
    # Built in one pass and assigned as a whole column instead of writing cell
    # by cell through chained indexing (df[col][i] = ...), which pandas does not
    # guarantee to modify the frame.
    eids_by_author = {}
    for author_id, eid in zip(df_dep.Author_ID.tolist(), df_dep.EID.tolist()):
        eids_by_author.setdefault(author_id, []).append(eid)
    df_authors_dep['EIDs'] = df_authors_dep.Author_ID.map(lambda a: eids_by_author.get(a, []))

    author_ids = df_authors_dep['Author_ID'].tolist()
    eid_sets = [set(eids) for eids in df_authors_dep['EIDs']]
    n_authors = len(author_ids)

    # Every unordered pair (i < j) goes into authors_dep; pairs sharing at least
    # one paper additionally go into collab_list with the shared-paper count.
    authors_dep = []
    collab_list = []

    for i in tqdm(range (n_authors), desc = 'loop 1'):
        for j in range (i + 1, n_authors):
            pair_key = author_ids[i] + author_ids[j]
            authors_dep.append((pair_key, author_ids[i], author_ids[j]))
            shared_papers = len(eid_sets[i] & eid_sets[j])
            if shared_papers:
                collab_list.append((pair_key, author_ids[i], author_ids[j], float(shared_papers)))

    df_data_set_dep = pd.DataFrame(data = authors_dep, columns=['Author_1_2', 'Author_1', 'Author_2'])
    df_data_set_dep = df_data_set_dep.set_index(['Author_1_2'])

    df_collab = pd.DataFrame(data = collab_list, columns=['Author_1_2', 'Author_1', 'Author_2', 'number_of_collaborations'])

    df_collab['collaboration_binary'] = df_collab.number_of_collaborations.map(lambda x: 1 if x > 0 else 0)

    print(f'{dep_win_start} to {dep_win_end} : Authors: {len(set(df_authors_dep.Author_ID.unique().tolist()))} collaborations: {df_collab.collaboration_binary.sum()}')

    # Index-aligned assignment: pairs without a shared paper get NaN here
    # (turned into 0 by the final fillna) and a collaboration flag of 0.
    df_collab.set_index('Author_1_2', inplace = True)
    df_data_set_dep['number_of_collaborations'] = df_collab.number_of_collaborations

    df_data_set_dep['collaboration_binary'] = df_data_set_dep.number_of_collaborations.map(lambda x: 1 if x > 0 else 0)

    # Features............................................................................................
    # Builds `df_authors_ind` (one row per selected author: independent-window
    # EIDs, mean LDA topic weights, co-author list), the shared-paper matrix
    # `collab_matrix` and the pair table `df_collab` for the independent window.

    topic_cols = ['topic_%d' % k for k in range(1, 10)]

    df_authors_ind = pd.DataFrame(data = list_authors, columns = ['Author_ID'])

    df_authors_ind['Author_ID'] = df_authors_ind.Author_ID.astype(str)

    # Placeholders created before the merge so they sit right after 'Author_ID'.
    df_authors_ind.insert(1,'EIDs','')
    df_authors_ind.insert(2,'partners',0)
    for offset, topic_col in enumerate(topic_cols):
        df_authors_ind.insert(3 + offset, topic_col, 0)


    df_authors_ind = df_authors_ind.merge(df_Authors, on = 'Author_ID', how = 'left')

    df_authors_ind.reset_index(inplace = True)
    df_ind = df_ind.reset_index()

    # EIDs of each author's independent-window papers, in df_ind row order.
    # Whole-column assignment replaces the cell-by-cell chained indexing
    # (df[col][i] = ...), which pandas does not guarantee to modify the frame.
    eids_by_author = {}
    for author_id, eid in zip(df_ind.Author_ID.tolist(), df_ind.EID.tolist()):
        eids_by_author.setdefault(author_id, []).append(eid)
    df_authors_ind['EIDs'] = df_authors_ind.Author_ID.map(lambda a: eids_by_author.get(a, []))

    # Mean topic weight over each author's papers; the LDA rows are selected
    # once per author rather than once per topic column.
    topic_rows = []
    for eids in tqdm(df_authors_ind['EIDs'], desc = 'loop 6'):
        author_papers = df_LDA[df_LDA.index.isin(eids)]
        topic_rows.append([author_papers[topic_col].mean() for topic_col in topic_cols])
    for topic_col, values in zip(topic_cols, zip(*topic_rows)):
        df_authors_ind[topic_col] = list(values)

    author_ids = df_authors_ind['Author_ID'].tolist()
    eid_sets = [set(eids) for eids in df_authors_ind['EIDs']]
    n_authors = len(author_ids)

    # collab_matrix[i, j] = number of papers shared by authors i and j; the
    # diagonal therefore holds each author's own paper count. The matrix is
    # symmetric, so each intersection is computed once and mirrored.
    collab_matrix = np.zeros((n_authors, n_authors))

    for i in tqdm(range (n_authors), desc = 'loop 8'):
        for j in range (i, n_authors):
            shared_papers = len(eid_sets[i] & eid_sets[j])
            collab_matrix[i,j] = shared_papers
            collab_matrix[j,i] = shared_papers

    # One row per pair (i < j) with at least one shared paper.
    collab_list = [
        (author_ids[i] + author_ids[j], author_ids[i], author_ids[j], collab_matrix[i,j])
        for i in range (n_authors)
        for j in range (i + 1, n_authors)
        if collab_matrix[i,j] != 0
    ]

    df_collab = pd.DataFrame(data = collab_list, columns=['Author_1_2', 'Author_1', 'Author_2', 'number_of_collaborations'])


    df_collab = df_collab.set_index(['Author_1_2'])

    # Co-authors of each author as a ';'-prefixed string (';id1;id2', or '' when
    # the author has none); the TENB step splits it again.
    df_authors_ind['partners'] = [
        ''.join(';' + author_ids[j]
                for j in range (n_authors)
                if j != i and collab_matrix[i,j] != 0)
        for i in range (n_authors)
    ]

    # Pair-level feature table: same pairs and same 'Author_1_2' index as the
    # target table, so the two can be merged on the index later.
    df_data_set_ind = pd.DataFrame(data = authors_dep, columns=['Author_1_2', 'Author_1', 'Author_2'])

    df_data_set_ind = df_data_set_ind.set_index(['Author_1_2'])

    # Placeholder columns, filled in section by section below.
    df_data_set_ind.insert(2,'TENB',0)
    df_data_set_ind.insert(3,'Cog_Dist', '')
    df_data_set_ind.insert(4,'Geo_Dist',0)
    df_data_set_ind.insert(5,'Diff_Country',0)
    df_data_set_ind.insert(6,'Diff_Continent',0)
    df_data_set_ind.insert(7,'Not_Contig',0)

    # TENB
    # For a pair (A, B), sum over every common partner P of
    #     collaborations(A, P) * collaborations(B, P) / papers(P),
    # all counted in the independent window.

    author_ids = df_authors_ind['Author_ID'].tolist()
    n_authors = len(author_ids)

    # Partner set of each author, parsed from the ';'-prefixed 'partners' string.
    partner_sets = [
        {partner for partner in partners.split(';') if partner}
        for partners in df_authors_ind['partners']
    ]

    # Pairs (i < j) that have at least one partner in common.
    list_common_partners = []

    for i in tqdm(range (n_authors), desc = 'loop 13'):
        for j in range (i + 1, n_authors):
            common = partner_sets[i] & partner_sets[j]
            if common:
                list_common_partners.append((author_ids[i] + author_ids[j], author_ids[i], author_ids[j], common))


    df_common_partners = pd.DataFrame(data = list_common_partners, columns = ['Author_1_2','Author_1','Author_2','Common_partners'])

    # Paper count per author = diagonal of the shared-paper matrix.
    n_articles_by_author = {author_ids[i]: collab_matrix[i,i] for i in range (n_authors)}

    # Shared-paper count per collaborating pair. df_collab stores each pair in
    # one orientation only, so lookups below try both orders.
    collabs_by_pair = dict(zip(zip(df_collab['Author_1'], df_collab['Author_2']),
                               df_collab['number_of_collaborations']))

    # TENB values are collected in a list and assigned as one float column.
    # Previously the column was initialised with the `float` type object and
    # filled cell by cell through chained indexing.
    tenb_values = []

    for _, Auth_1, Auth_2, common in tqdm(list_common_partners, desc = 'loop 15'):
        list_ENB = []
        for common_partner in common:
            num_article_common_partner = int(n_articles_by_author[common_partner])
            num_collab_Auth_1 = (collabs_by_pair.get((Auth_1, common_partner))
                                 or collabs_by_pair[(common_partner, Auth_1)])
            num_collab_Auth_2 = (collabs_by_pair.get((Auth_2, common_partner))
                                 or collabs_by_pair[(common_partner, Auth_2)])
            ENB = ((int(num_collab_Auth_1)) * (int(num_collab_Auth_2))) / num_article_common_partner
            list_ENB.append(ENB)
        tenb_values.append(sum(list_ENB))

    df_common_partners['TENB'] = tenb_values

    df_common_partners = df_common_partners.set_index(['Author_1_2'])

    # Index-aligned: pairs without a common partner get NaN (0 after the final fillna).
    df_data_set_ind['TENB'] = df_common_partners.TENB

    # The sections below look authors up by ID.
    df_authors_ind = df_authors_ind.set_index('Author_ID')

    #Cog_Dist

    def Cog_Dist_ (x, y):
        """Return 1 minus the correlation of the topic profiles of authors x and y.

        x and y are labels of `df_authors_ind`'s index (read when the function
        is called). Returns None when either profile is exactly nine zeros.
        """
        topic_columns = ['topic_1', 'topic_2', 'topic_3', 'topic_4', 'topic_5',
                         'topic_6', 'topic_7', 'topic_8', 'topic_9']
        no_topics = [0, 0, 0, 0, 0, 0, 0, 0, 0]

        profile_x = df_authors_ind.loc[x, topic_columns].tolist()
        profile_y = df_authors_ind.loc[y, topic_columns].tolist()

        if profile_x == no_topics or profile_y == no_topics:
            return None

        correlation = numpy.corrcoef(profile_x, profile_y)
        return 1 - correlation[0][1]

    # Registers progress_apply on pandas objects.
    tqdm.pandas()

    df_data_set_ind['Cog_Dist'] = df_data_set_ind.progress_apply(
        lambda pair: Cog_Dist_(pair.Author_1, pair.Author_2), axis = 1)

    # Geo_Dist

    import math

    def Geo_Distance (lat_1,lon_1,lat_2,lon_2):
        """Return the haversine great-circle distance between two points.

        Arguments are latitudes and longitudes in degrees. The result is in the
        unit of the sphere radius used below (6373.0). NaN inputs give NaN.
        """
        R = 6373.0
        lat1 = math.radians(lat_1)
        lon1 = math.radians(lon_1)
        lat2 = math.radians(lat_2)
        lon2 = math.radians(lon_2)
        dlon = lon2 - lon1
        dlat = lat2 - lat1
        a = math.sin(dlat / 2)**2 + math.cos(lat1) * math.cos(lat2) * math.sin(dlon / 2)**2
        # Rounding can push `a` marginally above 1 for (near-)antipodal points,
        # which would make sqrt(1 - a) raise ValueError. The comparison is False
        # for NaN, so NaN still propagates.
        if a > 1.0:
            a = 1.0
        c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
        distance = R * c
        return distance

    # Distance between the two authors of every pair, from the coordinates
    # stored per author in df_authors_ind (indexed by author ID).
    latitudes = df_authors_ind.Latitude
    longitudes = df_authors_ind.Longitude
    author_pairs = zip(df_data_set_ind.Author_1, df_data_set_ind.Author_2)

    dist = [
        (first, second,
         Geo_Distance(latitudes[first], longitudes[first], latitudes[second], longitudes[second]))
        for first, second in tqdm(author_pairs, total = len(df_data_set_ind), desc = 'loop 16')
    ]

    df_geo_dist = pd.DataFrame(data = dist, columns=['Author_1', 'Author_2', 'GeoDist'])

    # Rebuild the concatenated pair key so the distances align with
    # df_data_set_ind's index.
    df_geo_dist['Author_1_2'] = df_geo_dist['Author_1'] + df_geo_dist['Author_2']
    df_geo_dist = df_geo_dist.set_index('Author_1_2')

    df_data_set_ind['Geo_Dist'] = df_geo_dist['GeoDist']

    # Attach each author's attributes to the pair table: first for Author_1
    # (country code / continent renamed with suffix _1), then for Author_2
    # (suffix _2). Columns not needed for the comparison are dropped.
    df_temp = (
        pd.merge(df_data_set_ind, df_authors_ind, left_on = 'Author_1', right_index = True, how = 'left')
        .rename(columns = {'Country_code': 'Country_code_1', 'Continent': 'Continent_1'})
        .drop(columns = ['EIDs','partners', 'Aff_ID', 'Latitude', 'Longitude', 'Province', 'Province_code'])
    )
    df_temp2 = (
        pd.merge(df_temp, df_authors_ind, left_on = 'Author_2', right_index = True, how = 'left')
        .drop(columns = ['EIDs','partners', 'Aff_ID', 'Latitude', 'Longitude'])
        .rename(columns = {'Country_code': 'Country_code_2', 'Continent': 'Continent_2'})
    )

    def comparison_(x, y):
        """Return 0 when x equals y and 1 otherwise."""
        return 0 if x == y else 1

    # Registers progress_apply on pandas objects.
    tqdm.pandas()

    # 0/1 flags for "the two authors differ in continent / country code",
    # computed on the merged frame and copied back by index alignment.
    flag_sources = (
        ('Diff_Continent', 'Continent_1', 'Continent_2'),
        ('Diff_Country', 'Country_code_1', 'Country_code_2'),
    )
    for flag, first_col, second_col in flag_sources:
        df_temp2[flag] = df_temp2.progress_apply(
            lambda pair: comparison_(pair[first_col], pair[second_col]), axis = 1)
        df_data_set_ind[flag] = df_temp2[flag]

    list_prov = list(df_authors_ind.Province.unique())

    # Neighbour table used by contiguity_: country -> list of countries treated
    # as contiguous. Names must match the 'Country' values exactly; entries with
    # a trailing space ('Slovakia ', 'Austria ', 'Germany ', 'Belarus ',
    # 'Russia ') and the misspelling 'Russian' could never match and have been
    # corrected.
    # NOTE(review): the table is not symmetric and some entries look unintended;
    # please confirm against the intended definition of contiguity:
    #   - 'Germany' lists four neighbours, while 'Austria', 'Czech Republic',
    #     'Denmark', 'Poland' and 'Switzerland' each list 'Germany';
    #   - 'Spain' lists 'Serbia' and 'Albania';
    #   - 'Austria' lists 'Poland' and 'Croatia'.
    dic_contig = {'Canada':['United States'],
                  'United States':['Canada'],
                  'Austria' :['Germany', 'Czech Republic', 'Slovakia', 'Poland', 'Hungary', 'Slovenia', 'Croatia', 'Liechtenstein', 'Switzerland'],
                  'Belgium': ['France', 'Germany', 'Luxembourg', 'Netherlands'],
                  'Bulgaria': ['Romania', 'Serbia', 'Macedonia', 'Greece'],
                  'Croatia': ['Bosnia and Herzegovina', 'Hungary', 'Montenegro', 'Serbia', 'Slovenia'],
                  'Cyprus': ['Greece'],
                  'Czech Republic': ['Poland', 'Slovakia', 'Austria', 'Germany'],
                  'Denmark': ['Germany', 'Norway', 'Sweden'],
                  'Estonia': ['Russia', 'Latvia'],
                  'Finland': ['Sweden', 'Russia', 'Norway'],
                  'France': ['Belgium', 'Germany', 'Italy', 'Luxembourg', 'Spain', 'Switzerland'],
                  'Germany': ['Netherlands', 'Belgium', 'Luxembourg', 'France'],
                  'Greece': ['Albania', 'Bulgaria', 'Macedonia'],
                  'Hungary': ['Slovakia', 'Romania', 'Serbia', 'Croatia', 'Slovenia', 'Austria', 'Ukraine'],
                  'Ireland': ['United Kingdom'],
                  'Italy': ['Austria', 'France', 'Slovenia', 'Switzerland'],
                  'Latvia': ['Estonia', 'Russia', 'Belarus', 'Lithuania'],
                  'Lithuania': ['Latvia', 'Belarus', 'Poland'],
                  'Luxembourg': ['Belgium', 'France', 'Germany'],
                  'Malta': [],
                  'Netherlands': ['Germany', 'Belgium'],
                  'Poland': ['Germany', 'Czech Republic', 'Slovakia', 'Ukraine', 'Belarus', 'Lithuania', 'Russia'],
                  'Portugal': ['Spain'],
                  'Romania': ['Ukraine', 'Moldova', 'Bulgaria', 'Serbia', 'Hungary'],
                  'Slovakia': ['Poland', 'Ukraine', 'Hungary', 'Austria', 'Czech Republic'],
                  'Slovenia': ['Austria', 'Italy', 'Hungary', 'Croatia'],
                  'Spain' : ['Portugal', 'France', 'Serbia', 'Albania'],
                  'Sweden': ['Norway', 'Finland', 'Denmark'],
                  'United Kingdom': ['Ireland'],
                  'Switzerland': ['France', 'Germany', 'Austria', 'Liechtenstein', 'Italy'],
                  'Norway': ['Sweden', 'Finland', 'Russia'],
                  'Iceland': [],
                  'Ukraine': ['Belarus', 'Russia', 'Moldova', 'Romania', 'Hungary', 'Slovakia', 'Poland'],
                  'Serbia': ['Hungary', 'Romania', 'Bulgaria', 'Macedonia', 'Montenegro'],
                  'Macedonia': ['Bulgaria', 'Greece', 'Albania'],
                  'Bosnia and Herzegovina': ['Croatia', 'Serbia', 'Montenegro'],
                  'Albania': ['Montenegro', 'Macedonia', 'Greece'],
                  'Montenegro': ['Croatia', 'Bosnia and Herzegovina', 'Serbia', 'Albania'],
                  'Belarus': ['Poland', 'Lithuania', 'Latvia', 'Russia', 'Ukraine'],
                  'Moldova': ['Ukraine', 'Romania'],
                  'Russia': ['Ukraine', 'Belarus', 'Latvia', 'Estonia', 'Finland', 'Norway', 'Poland']
                 }

    # Attach each author's country name to the pair table, as Country_1 for
    # Author_1 and Country_2 for Author_2.
    unused_author_columns = ['EIDs','partners', 'Aff_ID', 'Latitude', 'Longitude']
    df_temp = (
        pd.merge(df_data_set_ind, df_authors_ind, left_on = 'Author_1', right_index = True, how = 'left')
        .rename(columns = {'Country':'Country_1'})
        .drop(columns = unused_author_columns)
    )
    df_temp2 = (
        pd.merge(df_temp, df_authors_ind, left_on = 'Author_2', right_index = True, how = 'left')
        .drop(columns = unused_author_columns)
        .rename(columns = {'Country':'Country_2'})
    )

    def contiguity_ (x, y):
        """Return 0 when countries x and y are contiguous per dic_contig, else 1.

        The pair counts as contiguous when either country lists the other, so
        the result does not depend on which author of a pair comes first.
        Raises KeyError when x is not a key of dic_contig.
        """
        if y in dic_contig[x] or x in dic_contig.get(y, ()):
            return 0
        return 1

    # 1 when the two authors' countries are not contiguous, 0 when they are.
    df_temp2['Not_Contig'] = df_temp2.progress_apply(
        lambda pair: contiguity_(pair.Country_1, pair.Country_2), axis = 1)

    df_data_set_ind['Not_Contig'] = df_temp2['Not_Contig']

    # Country dummies

    country_dummies = pd.get_dummies(df_temp2[['Country_1','Country_2']])
    df_data_set_ind = df_data_set_ind.join(country_dummies)

    # Targets and features share the 'Author_1_2' index; append this window's
    # rows to the overall data set.
    df_data_set_ = df_data_set_dep.merge(df_data_set_ind, how = 'left', left_index = True, right_index = True)
    df_data_set = pd.concat([df_data_set, df_data_set_])

# Final clean-up of the stacked windows: the merge of targets and features
# duplicated the author columns (suffixes _x / _y); keep one copy under the
# plain names and replace every missing value with 0.
df_data_set = (
    df_data_set.reset_index()
    .drop(columns = ['Author_1_y', 'Author_2_y'])
    .rename(columns = {'Author_1_x': 'Author_1', 'Author_2_x': 'Author_2'})
    .fillna(0)
)

# log(1 + x) transforms and their interaction terms.
df_data_set['Log_Geo_Dist'] = df_data_set['Geo_Dist'].progress_apply(math.log1p)
df_data_set['Log_TENB'] = df_data_set['TENB'].progress_apply(math.log1p)
df_data_set['Log_Geo_Dist X Log_TENB'] = df_data_set['Log_Geo_Dist'] * df_data_set['Log_TENB']
df_data_set['Log_Geo_Dist_Sq'] = df_data_set['Log_Geo_Dist'] * df_data_set['Log_Geo_Dist']
df_data_set['Log_Geo_Dist_Sq X Log_TENB'] = df_data_set['Log_Geo_Dist_Sq'] * df_data_set['Log_TENB']

df_data_set.to_csv(r'C:\Users\moham\Dropbox\QSE\Thesis\Geopattern\My data\df_data_set_Europe_North_America2a_test.csv')


Enlarging the screen is done!
Libraries were imported successfully!
Loading data from sql is done!


Preparing the data set:   0%|          | 0/7 [00:00<?, ?it/s]

loop 1:   0%|          | 0/49 [00:00<?, ?it/s]

loop 2:   0%|          | 0/49 [00:00<?, ?it/s]

loop 3:   0%|          | 0/49 [00:00<?, ?it/s]

loop 4:   0%|          | 0/49 [00:00<?, ?it/s]

loop 5:   0%|          | 0/49 [00:00<?, ?it/s]

2004 to 2005 : Authors: 49 collaborations: 25


loop 6:   0%|          | 0/49 [00:00<?, ?it/s]

loop 7:   0%|          | 0/49 [00:00<?, ?it/s]

loop 8:   0%|          | 0/49 [00:00<?, ?it/s]

loop 9:   0%|          | 0/49 [00:00<?, ?it/s]

loop 10:   0%|          | 0/49 [00:00<?, ?it/s]

loop 11:   0%|          | 0/49 [00:00<?, ?it/s]

loop 12:   0%|          | 0/49 [00:00<?, ?it/s]

loop 13:   0%|          | 0/49 [00:00<?, ?it/s]

loop 14:   0%|          | 0/49 [00:00<?, ?it/s]

loop 15:   0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/1176 [00:00<?, ?it/s]

loop 16:   0%|          | 0/1176 [00:00<?, ?it/s]

  0%|          | 0/1176 [00:00<?, ?it/s]

  0%|          | 0/1176 [00:00<?, ?it/s]

  0%|          | 0/1176 [00:00<?, ?it/s]

loop 1:   0%|          | 0/124 [00:00<?, ?it/s]

loop 2:   0%|          | 0/124 [00:00<?, ?it/s]

loop 3:   0%|          | 0/124 [00:00<?, ?it/s]

loop 4:   0%|          | 0/124 [00:00<?, ?it/s]

loop 5:   0%|          | 0/124 [00:00<?, ?it/s]

2006 to 2007 : Authors: 124 collaborations: 77


loop 6:   0%|          | 0/124 [00:00<?, ?it/s]

loop 7:   0%|          | 0/124 [00:00<?, ?it/s]

loop 8:   0%|          | 0/124 [00:00<?, ?it/s]

loop 9:   0%|          | 0/124 [00:00<?, ?it/s]

loop 10:   0%|          | 0/124 [00:00<?, ?it/s]

loop 11:   0%|          | 0/124 [00:00<?, ?it/s]

loop 12:   0%|          | 0/124 [00:00<?, ?it/s]

loop 13:   0%|          | 0/124 [00:00<?, ?it/s]

loop 14:   0%|          | 0/124 [00:00<?, ?it/s]

loop 15:   0%|          | 0/98 [00:00<?, ?it/s]

  0%|          | 0/7626 [00:00<?, ?it/s]

loop 16:   0%|          | 0/7626 [00:00<?, ?it/s]

  0%|          | 0/7626 [00:00<?, ?it/s]

  0%|          | 0/7626 [00:00<?, ?it/s]

  0%|          | 0/7626 [00:00<?, ?it/s]

loop 1:   0%|          | 0/210 [00:00<?, ?it/s]

loop 2:   0%|          | 0/210 [00:00<?, ?it/s]

loop 3:   0%|          | 0/210 [00:00<?, ?it/s]

loop 4:   0%|          | 0/210 [00:00<?, ?it/s]

loop 5:   0%|          | 0/210 [00:00<?, ?it/s]

2008 to 2009 : Authors: 210 collaborations: 130


loop 6:   0%|          | 0/210 [00:00<?, ?it/s]

loop 7:   0%|          | 0/210 [00:00<?, ?it/s]

loop 8:   0%|          | 0/210 [00:00<?, ?it/s]

loop 9:   0%|          | 0/210 [00:00<?, ?it/s]

loop 10:   0%|          | 0/210 [00:00<?, ?it/s]

loop 11:   0%|          | 0/210 [00:00<?, ?it/s]

loop 12:   0%|          | 0/210 [00:00<?, ?it/s]

loop 13:   0%|          | 0/210 [00:00<?, ?it/s]

loop 14:   0%|          | 0/210 [00:00<?, ?it/s]

loop 15:   0%|          | 0/179 [00:00<?, ?it/s]

  0%|          | 0/21945 [00:00<?, ?it/s]

loop 16:   0%|          | 0/21945 [00:00<?, ?it/s]

  0%|          | 0/21945 [00:00<?, ?it/s]

  0%|          | 0/21945 [00:00<?, ?it/s]

  0%|          | 0/21945 [00:00<?, ?it/s]

loop 1:   0%|          | 0/254 [00:00<?, ?it/s]

loop 2:   0%|          | 0/254 [00:00<?, ?it/s]

loop 3:   0%|          | 0/254 [00:00<?, ?it/s]

loop 4:   0%|          | 0/254 [00:00<?, ?it/s]

loop 5:   0%|          | 0/254 [00:00<?, ?it/s]

2010 to 2011 : Authors: 254 collaborations: 162


loop 6:   0%|          | 0/254 [00:00<?, ?it/s]

loop 7:   0%|          | 0/254 [00:00<?, ?it/s]

loop 8:   0%|          | 0/254 [00:00<?, ?it/s]

loop 9:   0%|          | 0/254 [00:00<?, ?it/s]

loop 10:   0%|          | 0/254 [00:00<?, ?it/s]

loop 11:   0%|          | 0/254 [00:00<?, ?it/s]

loop 12:   0%|          | 0/254 [00:00<?, ?it/s]

loop 13:   0%|          | 0/254 [00:00<?, ?it/s]

loop 14:   0%|          | 0/254 [00:00<?, ?it/s]

loop 15:   0%|          | 0/177 [00:00<?, ?it/s]

  0%|          | 0/32131 [00:00<?, ?it/s]

loop 16:   0%|          | 0/32131 [00:00<?, ?it/s]

  0%|          | 0/32131 [00:00<?, ?it/s]

  0%|          | 0/32131 [00:00<?, ?it/s]

  0%|          | 0/32131 [00:00<?, ?it/s]

loop 1:   0%|          | 0/342 [00:00<?, ?it/s]

loop 2:   0%|          | 0/342 [00:00<?, ?it/s]

loop 3:   0%|          | 0/342 [00:00<?, ?it/s]

loop 4:   0%|          | 0/342 [00:00<?, ?it/s]

loop 5:   0%|          | 0/342 [00:00<?, ?it/s]

2012 to 2013 : Authors: 342 collaborations: 191


loop 6:   0%|          | 0/342 [00:00<?, ?it/s]

loop 7:   0%|          | 0/342 [00:00<?, ?it/s]

loop 8:   0%|          | 0/342 [00:00<?, ?it/s]

loop 9:   0%|          | 0/342 [00:00<?, ?it/s]

loop 10:   0%|          | 0/342 [00:00<?, ?it/s]

loop 11:   0%|          | 0/342 [00:00<?, ?it/s]

loop 12:   0%|          | 0/342 [00:00<?, ?it/s]

loop 13:   0%|          | 0/342 [00:00<?, ?it/s]

loop 14:   0%|          | 0/342 [00:00<?, ?it/s]

loop 15:   0%|          | 0/318 [00:00<?, ?it/s]

  0%|          | 0/58311 [00:00<?, ?it/s]

loop 16:   0%|          | 0/58311 [00:00<?, ?it/s]