In [1]:
import pandas as pd
import plotly.express as px
import csv
from itables import init_notebook_mode, show
import scipy.stats as stats
import numpy as np

Requête exécutée sur le serveur : https://dbpedia.org/sparql

PREFIX dbr: <http://dbpedia.org/resource/><br>
PREFIX dbp: <http://dbpedia.org/property/><br>
PREFIX dbo: <http://dbpedia.org/ontology/><br>
PREFIX rdfs:<http://www.w3.org/2000/01/rdf-schema#><br>
SELECT DISTINCT (?o1 AS ?subject_uri) ?target (?name as ?field) ?birthYear<br>
WHERE {<br>
  SELECT DISTINCT ?o1 ?target (str(?label) as ?name) ?birthYear<br>
  WHERE { <br>
    {<br>
          {?o1 ?p dbr:Mathematician.}<br>
    }<br>
    ?o1 a dbo:Person;<br>
      dbp:birthDate | dbo:birthDate ?birthDate;<br>
      dbp:field | dbp:fields | dbo:academicDiscipline | dbp:occupation | dbo:occupation  ?target.<br>
  OPTIONAL{?target rdfs:label ?label.}<br>
    BIND(xsd:integer(SUBSTR(STR(?birthDate), 1, 4)) AS ?birthYear)<br>
    FILTER ( (?birthYear >= 800   && ?birthYear < 2000 )  && LANG(?label) = 'en') <br>
          }<br>
  ORDER BY ?birthYear<br>
  }<br>

In [2]:
import plotly.io as pio
pio.renderers.default = 'iframe' # or 'colab' or 'iframe' or 'iframe_connected' or 'sphinx_gallery'

In [3]:
path = '/Users/maxime/Documents/dossier_python/data/sparql_occupations.csv'

df = pd.read_csv(path)

print(df.shape, df.head(3), '\n\n-----\n', df.tail(3))

(3845, 4)                                   subject_uri  \
0  http://dbpedia.org/resource/Qusta_ibn_Luqa   
1  http://dbpedia.org/resource/Qusta_ibn_Luqa   
2  http://dbpedia.org/resource/Qusta_ibn_Luqa   

                                   target       field  birthYear  
0  http://dbpedia.org/resource/Translator  Translator        820  
1   http://dbpedia.org/resource/Scientist   Scientist        820  
2   http://dbpedia.org/resource/Physician   Physician        820   

-----
                                        subject_uri  \
3842   http://dbpedia.org/resource/Jacob_Tsimerman   
3843  http://dbpedia.org/resource/Michael_Viscardi   
3844    http://dbpedia.org/resource/Azat_Miftakhov   

                                         target          field  birthYear  
3842    http://dbpedia.org/resource/Mathematics    Mathematics       1988  
3843    http://dbpedia.org/resource/Mathematics    Mathematics       1989  
3844  http://dbpedia.org/resource/Mathematician  Mathematician       19

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3845 entries, 0 to 3844
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   subject_uri  3845 non-null   object
 1   target       3845 non-null   object
 2   field        3845 non-null   object
 3   birthYear    3845 non-null   int64 
dtypes: int64(1), object(3)
memory usage: 120.3+ KB


In [5]:
show(df)

subject_uri,target,field,birthYear
Loading... (need help?),,,


In [6]:
occupations = df.groupby(by='field').size().sort_values(ascending=False)
occupations = occupations.reset_index()

In [7]:
occupations.head()

Unnamed: 0,field,0
0,Mathematics,1617
1,Mathematician,457
2,Physics,99
3,Astronomy,68
4,Computer science,45


In [8]:
occupations = occupations.rename(columns={'field':'occupation', 0: 'effectif'})
occupations.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 684 entries, 0 to 683
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   occupation  684 non-null    object
 1   effectif    684 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 10.8+ KB


In [9]:
len(occupations), occupations.describe()

(684,
           effectif
 count   684.000000
 mean      5.621345
 std      64.388700
 min       1.000000
 25%       1.000000
 50%       1.000000
 75%       2.000000
 max    1617.000000)

In [10]:
fig = px.bar(occupations, x='occupation', y='effectif')
fig.show()

In [11]:
occupations.to_csv('/Users/maxime/Documents/dossier_python/data/grouped_occupation.csv')

## Creation de la classe occupation
Dans un premier temps je fais une analyse en enlevant pas le field "Mathematician" ou "Mathematics" s'il n'est pas unique. Je ferai une autre analyse en le faisait.

In [12]:
### Import et inspection des données
occupations_w_classes = pd.read_csv('/Users/maxime/Documents/dossier_python/data/grouped_occupation_with_classes_copie.csv')

In [13]:
occupations_w_classes.info

<bound method DataFrame.info of      Unnamed: 0                  occupation  effectif   classes
0             0                 Mathematics      1617      math
1             1               Mathematician       457      math
2             2                     Physics        99   sci_nat
3             3                   Astronomy        68   sci_nat
4             4            Computer science        45  ing_info
..          ...                         ...       ...       ...
679         679            Fourier analysis         1      math
680         680  Foundations of mathematics         1      math
681         681     Foundations of geometry         1      math
682         682      Formal language theory         1   sci_hum
683         683                     Zoology         1   sci_nat

[684 rows x 4 columns]>

In [14]:
occupations_w_classes.set_index('occupation', inplace=True, verify_integrity=True)
occupations_w_classes.drop(occupations_w_classes.columns[[0,1]], axis=1, inplace=True) 
show(occupations_w_classes)

Unnamed: 0_level_0,classes
occupation,Unnamed: 1_level_1
Loading... (need help?),


In [15]:
dfo = df.merge(occupations_w_classes, how='left', left_on='field', right_index=True)
show(dfo)

subject_uri,target,field,birthYear,classes
Loading... (need help?),,,,


In [16]:
classes = dfo.groupby(by='classes').size().sort_values(ascending=False)
classes = classes.reset_index()
classes

Unnamed: 0,classes,0
0,math,2713
1,sci_nat,470
2,ing_info,199
3,sci_hum,113
4,autre,83
5,philo,66
6,art,41
7,theo,21


In [17]:
classes = classes.rename(columns={ 0: 'effectif'})
classes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   classes   8 non-null      object
 1   effectif  8 non-null      int64 
dtypes: int64(1), object(1)
memory usage: 256.0+ bytes


In [18]:
classes['freq'] = classes['effectif'].apply(lambda x : x / sum(classes['effectif']))
classes.info(), show(classes)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   classes   8 non-null      object 
 1   effectif  8 non-null      int64  
 2   freq      8 non-null      float64
dtypes: float64(1), int64(1), object(1)
memory usage: 320.0+ bytes


classes,effectif,freq
Loading... (need help?),,


(None, None)

In [19]:
fig = px.bar(classes, x='classes', y='effectif')
fig.show()

In [20]:
classes['y'] = 0 
e = list(classes['effectif'])

In [21]:
fig = px.bar(classes,  x='freq', y = 'y', color='classes', height=100,
            orientation = 'h', hover_data=['freq', 'effectif'])
fig.update_layout()
fig.show()

In [22]:
df['birthYear'].min(),df['birthYear'].max()

(820, 1993)

In [23]:
dfo['qcut'] = pd.qcut(dfo['birthYear'], 6  )
type(dfo['qcut']), dfo.head(2)

(pandas.core.series.Series,
                                   subject_uri  \
 0  http://dbpedia.org/resource/Qusta_ibn_Luqa   
 1  http://dbpedia.org/resource/Qusta_ibn_Luqa   
 
                                    target       field  birthYear classes  \
 0  http://dbpedia.org/resource/Translator  Translator        820   autre   
 1   http://dbpedia.org/resource/Scientist   Scientist        820     NaN   
 
                 qcut  
 0  (819.999, 1835.0]  
 1  (819.999, 1835.0]  )

In [24]:
periodes = dfo.groupby(by='qcut').size()
periodes = periodes.reset_index()
periodes

Unnamed: 0,qcut,0
0,"(819.999, 1835.0]",648
1,"(1835.0, 1888.0]",639
2,"(1888.0, 1915.0]",645
3,"(1915.0, 1932.0]",645
4,"(1932.0, 1948.0]",672
5,"(1948.0, 1993.0]",596


In [25]:
bins = [819, 1835, 1888, 1915, 1932, 1948, 1994]

In [26]:
dfo['cut'] = pd.cut(dfo['birthYear'], bins=bins, right=False  )
df.head(2)


Unnamed: 0,subject_uri,target,field,birthYear
0,http://dbpedia.org/resource/Qusta_ibn_Luqa,http://dbpedia.org/resource/Translator,Translator,820
1,http://dbpedia.org/resource/Qusta_ibn_Luqa,http://dbpedia.org/resource/Scientist,Scientist,820


In [27]:
periodes = dfo.groupby(by='cut').size()
periodes = periodes.reset_index()
periodes

Unnamed: 0,cut,0
0,"[819, 1835)",638
1,"[1835, 1888)",616
2,"[1888, 1915)",656
3,"[1915, 1932)",631
4,"[1932, 1948)",661
5,"[1948, 1994)",643


In [28]:
periodes = periodes.rename(columns={ 0: 'effectif'})
periodes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   cut       6 non-null      category
 1   effectif  6 non-null      int64   
dtypes: category(1), int64(1)
memory usage: 450.0 bytes


In [29]:
generations = [819, 1835, 1888, 1915, 1932, 1948, 1994]
dfo['generation'] = pd.cut(dfo['birthYear'], generations, right=False)
# Inspection
dfo.head()

Unnamed: 0,subject_uri,target,field,birthYear,classes,qcut,cut,generation
0,http://dbpedia.org/resource/Qusta_ibn_Luqa,http://dbpedia.org/resource/Translator,Translator,820,autre,"(819.999, 1835.0]","[819, 1835)","[819, 1835)"
1,http://dbpedia.org/resource/Qusta_ibn_Luqa,http://dbpedia.org/resource/Scientist,Scientist,820,,"(819.999, 1835.0]","[819, 1835)","[819, 1835)"
2,http://dbpedia.org/resource/Qusta_ibn_Luqa,http://dbpedia.org/resource/Physician,Physician,820,sci_nat,"(819.999, 1835.0]","[819, 1835)","[819, 1835)"
3,http://dbpedia.org/resource/Maslama_al-Majriti,http://dbpedia.org/resource/Chemist,Chemist,950,sci_nat,"(819.999, 1835.0]","[819, 1835)","[819, 1835)"
4,http://dbpedia.org/resource/Maslama_al-Majriti,http://dbpedia.org/resource/Muslim,Muslim,950,theo,"(819.999, 1835.0]","[819, 1835)","[819, 1835)"


In [30]:
dfo.groupby(by='generation').size().sort_index()

generation
[819, 1835)     638
[1835, 1888)    616
[1888, 1915)    656
[1915, 1932)    631
[1932, 1948)    661
[1948, 1994)    643
dtype: int64

In [31]:
dfo.drop(dfo[['qcut']], axis=1, inplace=True) 

In [32]:
dfo['str_cut'] = dfo['cut'].apply(lambda x : str(int(x.left))+'-'+ str(int(x.right)-1))
dfo['str_cut'][:2]

0    819-1834
1    819-1834
Name: str_cut, dtype: category
Categories (6, object): ['819-1834' < '1835-1887' < '1888-1914' < '1915-1931' < '1932-1947' < '1948-1993']

In [33]:
dfo['gen'] = dfo['generation'].apply(lambda x : str(int(x.left))+'-'+ str(int(x.right)-1))
dfo['gen'][:2]

0    819-1834
1    819-1834
Name: gen, dtype: category
Categories (6, object): ['819-1834' < '1835-1887' < '1888-1914' < '1915-1931' < '1932-1947' < '1948-1993']

In [34]:
show(dfo)

subject_uri,target,field,birthYear,classes,cut,generation,str_cut,gen
Loading... (need help?),,,,,,,,


## Chi2 - Cuts

In [35]:
X = "classes"  # "0"
Y = "str_cut"

dfo_fs = dfo[[Y,X]].pivot_table(index=Y,columns=X,aggfunc=len,margins=True,margins_name="Total").fillna(0).astype(int)
dfo_fs

classes,art,autre,ing_info,math,philo,sci_hum,sci_nat,theo,Total
str_cut,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
819-1834,12,26,15,312,19,25,177,17,603
1835-1887,3,16,12,440,9,17,102,1,600
1888-1914,2,3,30,515,14,13,58,0,635
1915-1931,5,12,37,458,11,23,60,2,608
1932-1947,8,15,48,493,12,24,43,0,643
1948-1993,11,11,57,495,1,11,30,1,617
Total,41,83,199,2713,66,113,470,21,3706


In [36]:
dfo_fs.iat[-1,-1]

3706

In [37]:
tx = dfo_fs.loc[:,["Total"]]
ty = dfo_fs.loc[["Total"],:]
n = dfo_fs.iat[-1,-1] 

### Compute the matrix multiplication between the columns.
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.dot.html
indep = tx.dot(ty) / n
#pd.options.display.float_format = '{0:3.5}'.format

# Non arrondi
show(indep.round(3))

# Arrondi : effectifs théoriques
show(indep.round(0).astype(int))

classes,art,autre,ing_info,math,philo,sci_hum,sci_nat,theo,Total
str_cut,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Loading... (need help?),,,,,,,,,


classes,art,autre,ing_info,math,philo,sci_hum,sci_nat,theo,Total
str_cut,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Loading... (need help?),,,,,,,,,


In [38]:
ecarts = (dfo_fs-indep).iloc[:-1,:-1]
## Attention : arrondi aux entiers dans l'affichage
print(ecarts.round(0).astype(int))

classes    art  autre  ing_info  math  philo  sci_hum  sci_nat  theo
str_cut                                                             
819-1834     5     12       -17  -129      8        7      101    14
1835-1887   -4      3       -20     1     -2       -1       26    -2
1888-1914   -5    -11        -4    50      3       -6      -23    -4
1915-1931   -2     -2         4    13      0        4      -17    -1
1932-1947    1      1        13    22      1        4      -39    -4
1948-1993    4     -3        24    43    -10       -8      -48    -2


In [39]:
tableau = ecarts.iloc[:-1,:-1].round(1)
fig = px.imshow(tableau, text_auto=True, aspect='auto')
fig.show()

In [40]:
ecarts_ponderes = round((dfo_fs-indep)**2/indep,2)
ecarts_ponderes.iloc[:-1,:-1]

classes,art,autre,ing_info,math,philo,sci_hum,sci_nat,theo
str_cut,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
819-1834,4.26,11.56,9.33,37.95,6.36,2.38,132.15,54.0
1835-1887,1.99,0.49,12.69,0.0,0.27,0.09,8.82,1.69
1888-1914,3.59,8.85,0.49,5.41,0.64,2.09,6.3,3.6
1915-1931,0.44,0.19,0.58,0.37,0.0,1.07,3.8,0.61
1932-1947,0.11,0.02,5.26,1.06,0.03,0.98,18.22,3.64
1948-1993,2.55,0.57,17.2,4.16,9.08,3.24,29.75,1.78


In [41]:
chi_2 = ecarts_ponderes.sum().sum()
print(round(chi_2, 2))

419.71


In [42]:
chi2 = stats.chi2_contingency(dfo_fs.iloc[:-1,:-1])

In [43]:
chi2.statistic, chi2.pvalue, chi2.dof

(419.72553167455857, 1.872469195129302e-67, 35)

In [44]:
dfo

Unnamed: 0,subject_uri,target,field,birthYear,classes,cut,generation,str_cut,gen
0,http://dbpedia.org/resource/Qusta_ibn_Luqa,http://dbpedia.org/resource/Translator,Translator,820,autre,"[819, 1835)","[819, 1835)",819-1834,819-1834
1,http://dbpedia.org/resource/Qusta_ibn_Luqa,http://dbpedia.org/resource/Scientist,Scientist,820,,"[819, 1835)","[819, 1835)",819-1834,819-1834
2,http://dbpedia.org/resource/Qusta_ibn_Luqa,http://dbpedia.org/resource/Physician,Physician,820,sci_nat,"[819, 1835)","[819, 1835)",819-1834,819-1834
3,http://dbpedia.org/resource/Maslama_al-Majriti,http://dbpedia.org/resource/Chemist,Chemist,950,sci_nat,"[819, 1835)","[819, 1835)",819-1834,819-1834
4,http://dbpedia.org/resource/Maslama_al-Majriti,http://dbpedia.org/resource/Muslim,Muslim,950,theo,"[819, 1835)","[819, 1835)",819-1834,819-1834
...,...,...,...,...,...,...,...,...,...
3840,http://dbpedia.org/resource/Peter_Scholze,http://dbpedia.org/resource/Algebraic_number_t...,Algebraic number theory,1987,math,"[1948, 1994)","[1948, 1994)",1948-1993,1948-1993
3841,http://dbpedia.org/resource/Marja_Holecyová,http://dbpedia.org/resource/Mathematician,Mathematician,1988,math,"[1948, 1994)","[1948, 1994)",1948-1993,1948-1993
3842,http://dbpedia.org/resource/Jacob_Tsimerman,http://dbpedia.org/resource/Mathematics,Mathematics,1988,math,"[1948, 1994)","[1948, 1994)",1948-1993,1948-1993
3843,http://dbpedia.org/resource/Michael_Viscardi,http://dbpedia.org/resource/Mathematics,Mathematics,1989,math,"[1948, 1994)","[1948, 1994)",1948-1993,1948-1993


## Creation de la classe occupation avec un nettoyage de 'Mathematics' et 'Mathematician'
Puisque toutes les personnes étudiées sont sensées être liés aux mathématiques, j'ai donc enlevé le 'Mathematics' et 'Mathematician' des fields, si ce n'était pas l'activité unique d'une personne

In [45]:
#Je vais maintenant refaire la même analyse, mais en nettoyant mes données différemment.
#Je vais supprimer l'occupation 'Mathematics' et 'Mathematician' si ce ne sont pas les uniques activités.
#Je pourrai alors comparer les resultats
dfo_no_math = dfo

i = 0

for a in dfo_no_math['subject_uri']:
    if i == 0:
        if a == dfo_no_math.loc[i+1, 'subject_uri'] and (dfo_no_math.loc[i, 'field'] == 'Mathematics' or dfo_no_math.loc[i, 'field'] == 'Mathematician'):
            dfo_no_math.loc[i, 'classes'] = None
    if i > 0 and i < dfo_no_math['subject_uri'].size-1:
        if (a == dfo_no_math.loc[i+1, 'subject_uri'] or a == dfo_no_math.loc[i-1, 'subject_uri']) and (dfo_no_math.loc[i, 'field'] == 'Mathematics' or dfo_no_math.loc[i, 'field'] == 'Mathematician'):
            dfo_no_math.loc[i, 'classes'] = None
    if i == dfo_no_math['subject_uri'].size:
        if a == dfo_no_math.loc[i-1, 'subject_uri'] and (dfo_no_math.loc[i, 'field'] == 'Mathematics' or dfo_no_math.loc[i, 'field'] == 'Mathematician'):
            dfo_no_math.loc[i, 'classes'] = None
    
    i = i+1

dfo_no_math


Unnamed: 0,subject_uri,target,field,birthYear,classes,cut,generation,str_cut,gen
0,http://dbpedia.org/resource/Qusta_ibn_Luqa,http://dbpedia.org/resource/Translator,Translator,820,autre,"[819, 1835)","[819, 1835)",819-1834,819-1834
1,http://dbpedia.org/resource/Qusta_ibn_Luqa,http://dbpedia.org/resource/Scientist,Scientist,820,,"[819, 1835)","[819, 1835)",819-1834,819-1834
2,http://dbpedia.org/resource/Qusta_ibn_Luqa,http://dbpedia.org/resource/Physician,Physician,820,sci_nat,"[819, 1835)","[819, 1835)",819-1834,819-1834
3,http://dbpedia.org/resource/Maslama_al-Majriti,http://dbpedia.org/resource/Chemist,Chemist,950,sci_nat,"[819, 1835)","[819, 1835)",819-1834,819-1834
4,http://dbpedia.org/resource/Maslama_al-Majriti,http://dbpedia.org/resource/Muslim,Muslim,950,theo,"[819, 1835)","[819, 1835)",819-1834,819-1834
...,...,...,...,...,...,...,...,...,...
3840,http://dbpedia.org/resource/Peter_Scholze,http://dbpedia.org/resource/Algebraic_number_t...,Algebraic number theory,1987,math,"[1948, 1994)","[1948, 1994)",1948-1993,1948-1993
3841,http://dbpedia.org/resource/Marja_Holecyová,http://dbpedia.org/resource/Mathematician,Mathematician,1988,math,"[1948, 1994)","[1948, 1994)",1948-1993,1948-1993
3842,http://dbpedia.org/resource/Jacob_Tsimerman,http://dbpedia.org/resource/Mathematics,Mathematics,1988,math,"[1948, 1994)","[1948, 1994)",1948-1993,1948-1993
3843,http://dbpedia.org/resource/Michael_Viscardi,http://dbpedia.org/resource/Mathematics,Mathematics,1989,math,"[1948, 1994)","[1948, 1994)",1948-1993,1948-1993


In [46]:
classes = dfo_no_math.groupby(by='classes').size().sort_values(ascending=False)
classes = classes.reset_index()
classes

Unnamed: 0,classes,0
0,math,2369
1,sci_nat,470
2,ing_info,199
3,sci_hum,113
4,autre,83
5,philo,66
6,art,41
7,theo,21


In [47]:
classes = classes.rename(columns={ 0: 'effectif'})
classes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   classes   8 non-null      object
 1   effectif  8 non-null      int64 
dtypes: int64(1), object(1)
memory usage: 256.0+ bytes


In [48]:
classes['freq'] = classes['effectif'].apply(lambda x : x / sum(classes['effectif']))
classes.info(), show(classes)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   classes   8 non-null      object 
 1   effectif  8 non-null      int64  
 2   freq      8 non-null      float64
dtypes: float64(1), int64(1), object(1)
memory usage: 320.0+ bytes


classes,effectif,freq
Loading... (need help?),,


(None, None)

In [49]:
fig = px.bar(classes, x='classes', y='effectif')
fig.show()

In [50]:
classes['y'] = 0 
e = list(classes['effectif'])

In [51]:
fig = px.bar(classes,  x='freq', y = 'y', color='classes', height=100,
            orientation = 'h', hover_data=['freq', 'effectif'])
fig.update_layout()
fig.show()

In [52]:
df['birthYear'].min(),df['birthYear'].max()

(820, 1993)

In [53]:
dfo_no_math['qcut'] = pd.qcut(dfo_no_math['birthYear'], 6  )
type(dfo_no_math['qcut']), dfo_no_math.head(2)

(pandas.core.series.Series,
                                   subject_uri  \
 0  http://dbpedia.org/resource/Qusta_ibn_Luqa   
 1  http://dbpedia.org/resource/Qusta_ibn_Luqa   
 
                                    target       field  birthYear classes  \
 0  http://dbpedia.org/resource/Translator  Translator        820   autre   
 1   http://dbpedia.org/resource/Scientist   Scientist        820     NaN   
 
            cut   generation   str_cut       gen               qcut  
 0  [819, 1835)  [819, 1835)  819-1834  819-1834  (819.999, 1835.0]  
 1  [819, 1835)  [819, 1835)  819-1834  819-1834  (819.999, 1835.0]  )

In [54]:
periodes = dfo_no_math.groupby(by='qcut').size()
periodes = periodes.reset_index()
periodes

Unnamed: 0,qcut,0
0,"(819.999, 1835.0]",648
1,"(1835.0, 1888.0]",639
2,"(1888.0, 1915.0]",645
3,"(1915.0, 1932.0]",645
4,"(1932.0, 1948.0]",672
5,"(1948.0, 1993.0]",596


In [55]:
bins = [819, 1835, 1888, 1915, 1932, 1948, 1994]

In [56]:
dfo_no_math['cut'] = pd.cut(dfo_no_math['birthYear'], bins=bins, right=False  )
df.head(2)

Unnamed: 0,subject_uri,target,field,birthYear
0,http://dbpedia.org/resource/Qusta_ibn_Luqa,http://dbpedia.org/resource/Translator,Translator,820
1,http://dbpedia.org/resource/Qusta_ibn_Luqa,http://dbpedia.org/resource/Scientist,Scientist,820


In [57]:
periodes = dfo_no_math.groupby(by='cut').size()
periodes = periodes.reset_index()
periodes

Unnamed: 0,cut,0
0,"[819, 1835)",638
1,"[1835, 1888)",616
2,"[1888, 1915)",656
3,"[1915, 1932)",631
4,"[1932, 1948)",661
5,"[1948, 1994)",643


In [58]:
periodes = periodes.rename(columns={ 0: 'effectif'})
periodes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   cut       6 non-null      category
 1   effectif  6 non-null      int64   
dtypes: category(1), int64(1)
memory usage: 450.0 bytes


In [59]:
generations = [819, 1835, 1888, 1915, 1932, 1948, 1994]
dfo_no_math['generation'] = pd.cut(dfo_no_math['birthYear'], generations, right=False)
# Inspection
dfo_no_math.head()

Unnamed: 0,subject_uri,target,field,birthYear,classes,cut,generation,str_cut,gen,qcut
0,http://dbpedia.org/resource/Qusta_ibn_Luqa,http://dbpedia.org/resource/Translator,Translator,820,autre,"[819, 1835)","[819, 1835)",819-1834,819-1834,"(819.999, 1835.0]"
1,http://dbpedia.org/resource/Qusta_ibn_Luqa,http://dbpedia.org/resource/Scientist,Scientist,820,,"[819, 1835)","[819, 1835)",819-1834,819-1834,"(819.999, 1835.0]"
2,http://dbpedia.org/resource/Qusta_ibn_Luqa,http://dbpedia.org/resource/Physician,Physician,820,sci_nat,"[819, 1835)","[819, 1835)",819-1834,819-1834,"(819.999, 1835.0]"
3,http://dbpedia.org/resource/Maslama_al-Majriti,http://dbpedia.org/resource/Chemist,Chemist,950,sci_nat,"[819, 1835)","[819, 1835)",819-1834,819-1834,"(819.999, 1835.0]"
4,http://dbpedia.org/resource/Maslama_al-Majriti,http://dbpedia.org/resource/Muslim,Muslim,950,theo,"[819, 1835)","[819, 1835)",819-1834,819-1834,"(819.999, 1835.0]"


In [60]:
dfo_no_math.groupby(by='generation').size().sort_index()

generation
[819, 1835)     638
[1835, 1888)    616
[1888, 1915)    656
[1915, 1932)    631
[1932, 1948)    661
[1948, 1994)    643
dtype: int64

In [61]:
dfo_no_math.drop(dfo_no_math[['qcut']], axis=1, inplace=True) 

In [62]:
dfo_no_math['str_cut'] = dfo_no_math['cut'].apply(lambda x : str(int(x.left))+'-'+ str(int(x.right)-1))
dfo_no_math['str_cut'][:2]

0    819-1834
1    819-1834
Name: str_cut, dtype: category
Categories (6, object): ['819-1834' < '1835-1887' < '1888-1914' < '1915-1931' < '1932-1947' < '1948-1993']

In [63]:
dfo_no_math['gen'] = dfo_no_math['generation'].apply(lambda x : str(int(x.left))+'-'+ str(int(x.right)-1))
dfo_no_math['gen'][:2]

0    819-1834
1    819-1834
Name: gen, dtype: category
Categories (6, object): ['819-1834' < '1835-1887' < '1888-1914' < '1915-1931' < '1932-1947' < '1948-1993']

In [64]:
show(dfo_no_math)

subject_uri,target,field,birthYear,classes,cut,generation,str_cut,gen
Loading... (need help?),,,,,,,,


In [65]:
X = "classes"  # "0"
Y = "str_cut"

dfo_fs = dfo_no_math[[Y,X]].pivot_table(index=Y,columns=X,aggfunc=len,margins=True,margins_name="Total").fillna(0).astype(int)
dfo_fs

classes,art,autre,ing_info,math,philo,sci_hum,sci_nat,theo,Total
str_cut,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
819-1834,12,26,15,175,19,25,177,17,466
1835-1887,3,16,12,385,9,17,102,1,545
1888-1914,2,3,30,466,14,13,58,0,586
1915-1931,5,12,37,425,11,23,60,2,575
1932-1947,8,15,48,460,12,24,43,0,610
1948-1993,11,11,57,458,1,11,30,1,580
Total,41,83,199,2369,66,113,470,21,3362


In [66]:
dfo_fs.iat[-1,-1]

3362

In [67]:
tx = dfo_fs.loc[:,["Total"]]
ty = dfo_fs.loc[["Total"],:]
n = dfo_fs.iat[-1,-1] 

### Compute the matrix multiplication between the columns.
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.dot.html
indep = tx.dot(ty) / n
#pd.options.display.float_format = '{0:3.5}'.format

# Non arrondi
show(indep.round(3))

# Arrondi : effectifs théoriques
show(indep.round(0).astype(int))

classes,art,autre,ing_info,math,philo,sci_hum,sci_nat,theo,Total
str_cut,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Loading... (need help?),,,,,,,,,


classes,art,autre,ing_info,math,philo,sci_hum,sci_nat,theo,Total
str_cut,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Loading... (need help?),,,,,,,,,


In [68]:
ecarts = (dfo_fs-indep).iloc[:-1,:-1]
## Attention : arrondi aux entiers dans l'affichage
print(ecarts.round(0).astype(int))

classes    art  autre  ing_info  math  philo  sci_hum  sci_nat  theo
str_cut                                                             
819-1834     6     14       -13  -153     10        9      112    14
1835-1887   -4      3       -20     1     -2       -1       26    -2
1888-1914   -5    -11        -5    53      2       -7      -24    -4
1915-1931   -2     -2         3    20      0        4      -20    -2
1932-1947    1      0        12    30      0        3      -42    -4
1948-1993    4     -3        23    49    -10       -8      -51    -3


In [69]:
tableau = ecarts.iloc[:-1,:-1].round(1)
fig = px.imshow(tableau, text_auto=True, aspect='auto')
fig.show()

In [70]:
ecarts_ponderes = round((dfo_fs-indep)**2/indep,2)
ecarts_ponderes.iloc[:-1,:-1]

classes,art,autre,ing_info,math,philo,sci_hum,sci_nat,theo
str_cut,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
819-1834,7.02,18.26,5.74,71.63,10.61,5.57,192.05,68.2
1835-1887,2.0,0.48,12.72,0.0,0.27,0.09,8.74,1.7
1888-1914,3.71,9.09,0.63,6.82,0.54,2.28,6.99,3.66
1915-1931,0.58,0.34,0.26,0.97,0.01,0.7,5.17,0.71
1932-1947,0.04,0.0,3.92,2.12,0.0,0.6,20.96,3.81
1948-1993,2.18,0.77,14.97,5.95,9.47,3.7,32.18,1.9


In [71]:
ecarts_ponderes = round((dfo_fs-indep)**2/indep,2)
ecarts_ponderes.iloc[:-1,:-1]

classes,art,autre,ing_info,math,philo,sci_hum,sci_nat,theo
str_cut,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
819-1834,7.02,18.26,5.74,71.63,10.61,5.57,192.05,68.2
1835-1887,2.0,0.48,12.72,0.0,0.27,0.09,8.74,1.7
1888-1914,3.71,9.09,0.63,6.82,0.54,2.28,6.99,3.66
1915-1931,0.58,0.34,0.26,0.97,0.01,0.7,5.17,0.71
1932-1947,0.04,0.0,3.92,2.12,0.0,0.6,20.96,3.81
1948-1993,2.18,0.77,14.97,5.95,9.47,3.7,32.18,1.9


In [72]:
chi2 = stats.chi2_contingency(dfo_fs.iloc[:-1,:-1])

In [73]:
chi2.statistic, chi2.pvalue, chi2.dof

(550.1024355188397, 7.784080639611941e-94, 35)

## Test du V de Cramer

In [74]:
dfo_fs.iloc[:-1,:-1]

classes,art,autre,ing_info,math,philo,sci_hum,sci_nat,theo
str_cut,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
819-1834,12,26,15,175,19,25,177,17
1835-1887,3,16,12,385,9,17,102,1
1888-1914,2,3,30,466,14,13,58,0
1915-1931,5,12,37,425,11,23,60,2
1932-1947,8,15,48,460,12,24,43,0
1948-1993,11,11,57,458,1,11,30,1


In [75]:
X2 = chi2.statistic

N = np.sum(np.array(dfo_fs.iloc[:-1,:-1]))
minimum_dimension = min(dfo_fs.shape)-1
N, X2, minimum_dimension

(3362, 550.1024355188397, 6)

In [76]:
result = np.sqrt((X2/N) / (minimum_dimension-1) )
print(result)

0.18089973353827832


In [77]:
stats.contingency.association(dfo_fs.iloc[:-1,:-1], method='cramer')

0.18089973353827832

In [78]:
table = ecarts_ponderes.iloc[:-1, :-1]/chi_2
table['total'] = table.sum(axis=1)
table.loc['total'] = table.sum(axis=0)
### % plus lisibles
rt = round(table*100,2)
rt

classes,art,autre,ing_info,math,philo,sci_hum,sci_nat,theo,total
str_cut,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
819-1834,1.67,4.35,1.37,17.07,2.53,1.33,45.76,16.25,90.32
1835-1887,0.48,0.11,3.03,0.0,0.06,0.02,2.08,0.41,6.19
1888-1914,0.88,2.17,0.15,1.62,0.13,0.54,1.67,0.87,8.03
1915-1931,0.14,0.08,0.06,0.23,0.0,0.17,1.23,0.17,2.08
1932-1947,0.01,0.0,0.93,0.51,0.0,0.14,4.99,0.91,7.49
1948-1993,0.52,0.18,3.57,1.42,2.26,0.88,7.67,0.45,16.95
total,3.7,6.9,9.11,20.85,4.98,3.08,63.4,19.06,131.07


In [79]:
fig = px.imshow(rt.iloc[:-1, :-1], text_auto=True, aspect='auto')
fig.show()

## Chi2 - Génération

In [80]:
X = "classes"  # "0"
Y = "gen"

dfo_fs = dfo_no_math[[Y,X]].pivot_table(index=Y,columns=X,aggfunc=len,margins=True,margins_name="Total").fillna(0).astype(int)
dfo_fs

classes,art,autre,ing_info,math,philo,sci_hum,sci_nat,theo,Total
gen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
819-1834,12,26,15,175,19,25,177,17,466
1835-1887,3,16,12,385,9,17,102,1,545
1888-1914,2,3,30,466,14,13,58,0,586
1915-1931,5,12,37,425,11,23,60,2,575
1932-1947,8,15,48,460,12,24,43,0,610
1948-1993,11,11,57,458,1,11,30,1,580
Total,41,83,199,2369,66,113,470,21,3362


In [81]:
dfo_fs.iat[-1,-1]

3362

In [82]:
tx = dfo_fs.loc[:,["Total"]]
ty = dfo_fs.loc[["Total"],:]
n = dfo_fs.iat[-1,-1] 

indep = tx.dot(ty) / n

In [83]:
ecarts = (dfo_fs-indep).iloc[:-1,:-1]
print(ecarts.round(0).astype(int))

classes    art  autre  ing_info  math  philo  sci_hum  sci_nat  theo
gen                                                                 
819-1834     6     14       -13  -153     10        9      112    14
1835-1887   -4      3       -20     1     -2       -1       26    -2
1888-1914   -5    -11        -5    53      2       -7      -24    -4
1915-1931   -2     -2         3    20      0        4      -20    -2
1932-1947    1      0        12    30      0        3      -42    -4
1948-1993    4     -3        23    49    -10       -8      -51    -3


In [84]:
ecarts_ponderes = round((dfo_fs-indep)**2/indep,2)
ecarts_ponderes.iloc[:-1,:-1]
chi_2 = ecarts_ponderes.sum().sum()
print(round(chi_2, 2))


550.11


In [85]:
chi2 = stats.chi2_contingency(dfo_fs.iloc[:-1,:-1])

In [86]:
chi2.statistic, chi2.pvalue, chi2.dof

(550.1024355188397, 7.784080639611941e-94, 35)

In [87]:
sh = ecarts_ponderes.iloc[:-1,:-1].shape
print(sh)
v = (sh[0]-1) * (sh[1]-1)
v

(6, 8)


35

In [88]:
(len(ecarts_ponderes)-2) * (len(ecarts_ponderes.columns)-2), len(ecarts_ponderes)-1,len(ecarts_ponderes.columns)-1

(35, 6, 8)

In [89]:
table = ecarts_ponderes/chi_2
table['total'] = table.sum(axis=1)
table.loc['total'] = table.sum(axis=0)
### % plus lisibles
tr = round(table*100,2)
tr

classes,art,autre,ing_info,math,philo,sci_hum,sci_nat,theo,Total,total
gen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
819-1834,1.28,3.32,1.04,13.02,1.93,1.01,34.91,12.4,0.0,68.91
1835-1887,0.36,0.09,2.31,0.0,0.05,0.02,1.59,0.31,0.0,4.73
1888-1914,0.67,1.65,0.11,1.24,0.1,0.41,1.27,0.67,0.0,6.13
1915-1931,0.11,0.06,0.05,0.18,0.0,0.13,0.94,0.13,0.0,1.59
1932-1947,0.01,0.0,0.71,0.39,0.0,0.11,3.81,0.69,0.0,5.72
1948-1993,0.4,0.14,2.72,1.08,1.72,0.67,5.85,0.35,0.0,12.93
Total,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
total,2.82,5.26,6.95,15.9,3.8,2.35,48.37,14.54,0.0,100.0


In [90]:
tableau = tr.iloc[:-1,:-1].round(1)
fig = px.imshow(tableau, text_auto=True, aspect='auto')
fig.show()

In [91]:
tableau = ecarts.iloc[:-1,:-1].round(1)
fig = px.imshow(tableau, text_auto=True, aspect='auto')
fig.show()

In [92]:
dfo_fs.iloc[:-1,:-1]

classes,art,autre,ing_info,math,philo,sci_hum,sci_nat,theo
gen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
819-1834,12,26,15,175,19,25,177,17
1835-1887,3,16,12,385,9,17,102,1
1888-1914,2,3,30,466,14,13,58,0
1915-1931,5,12,37,425,11,23,60,2
1932-1947,8,15,48,460,12,24,43,0
1948-1993,11,11,57,458,1,11,30,1


In [93]:
stats.contingency.association(dfo_fs.iloc[:-1,:-1], method='cramer')

0.18089973353827832