# Data analysis across scientific papers related to the phenotypic plasticity hypothesis

## Phenotypic plasticity hypothesis:  Invasive species are more phenotypically plastic than non-invasive or native ones.

## 1. Import the required libraries

In [3]:
import requests
import datetime
import pandas as pd
import numpy as np
from orkg import ORKG
from bokeh.io import export_png
from bokeh.models import ColumnDataSource,LabelSet, HoverTool, WheelZoomTool, ResetTool, SaveTool, PanTool, DatetimeTickFormatter, Whisker
from bokeh.plotting import figure, show, output_notebook
from bokeh.transform import cumsum
from bokeh.palettes import Category10



import matplotlib.pyplot as plt

from math import pi

import os
import re
output_notebook()

## 2. Connect to ORKG and import phenotypic plasticity hypothesis comparison table

In [None]:
# Client for the public ORKG instance and its comparison (simcomp) service
orkg = ORKG(
    host='https://orkg.org/orkg',
    simcomp_host='https://orkg.org/orkg/simcomp',
)

# Download comparison R54244, transpose it, and shorten the
# 'stand of hypothesis' column label to 'stand' for the cells below
df = (
    orkg.contributions
    .compare_dataframe(comparison_id='R54244')
    .T
    .rename(columns={'stand of hypothesis': 'stand'})
)
#df

## 3. Preprocess the data

In [None]:
# Copy the first column into a new "study" column, turn the index into a
# regular column, and strip the stray leading space from one column label.
# NOTE(review): "study" is meant to hold the paper titles — confirm that the
# first column (and not the index) is where the titles live.
df = (
    df.assign(study=df.iloc[:, 0])
    .reset_index()
    .rename(columns={' Phenotypic plasticity form': 'Phenotypic plasticity form'})
)
df

In [None]:
# One row is duplicated. This command should be deleted once the duplicate contribution is deleted in ORKG.
#df = df.drop(df.index[[26]])
#df = df.reset_index()
#df

### The main hypothesis is divided into 4 sub hypotheses according to the phenotypic plasticity form.

## 4.  Analytical questions

## Question 1: How many research contributions have discussed this hypothesis? How many contributions support, are undecided, or question the hypothesis?

### The main hypothesis

In [None]:
# Number of contributions per stand, stored in a "count" column
df_stand = df[['stand']].value_counts().reset_index(name='count')
df_stand

In [None]:
# Donut chart (matplotlib) of the stands taken towards the main hypothesis
labels = np.array(df_stand["stand"]).astype(str)
sizes = np.array(df_stand["count"]).astype(int)
# NOTE(review): colours are matched to slices by position (the row order of
# df_stand), not by stand name — confirm the order is the intended one.
colors = ['#66FF00','#C0C0C0','#FF0033']
# Total number of contributions, derived from the data so the centre label
# cannot go stale when the comparison table changes
total = int(sizes.sum())

# Set the font size before drawing so that this figure already uses it
plt.rcParams.update({'font.size': 15})

fig1, ax1 = plt.subplots(figsize=(9, 7))
ax1.pie(
    sizes,
    colors=colors,
    labels=labels,
    # slice annotation: share in percent followed by the absolute count
    autopct=lambda p: f'{p:.2f}%, {p*total/100 :.0f} ',
    startangle=90,
)
# A white disc in the centre turns the pie into a donut
centre_circle = plt.Circle((0,0),0.70,fc='white')
ax1.add_artist(centre_circle)
# Equal aspect ratio ensures that the pie is drawn as a circle
ax1.axis('equal')
plt.tight_layout()
ax1.text(0, 0, str(total), ha='center', fontsize=30)

plt.savefig("img/main_hyp.png")
plt.show()

## Question 2: How many contributions have discussed each sub hypothesis? How many contributions support, are undecided about, or question each sub hypothesis?

### The first sub hypothesis considers morphology as the phenotypic plasticity form.

In [None]:
# Contributions whose plasticity form mentions morphology, tallied per stand.
# na=False treats a missing plasticity form as "no match"; without it the
# mask contains NaN and the boolean indexing fails.
df_morphology = df[df['Phenotypic plasticity form'].str.contains('Morphology', na=False)]
df_morphology_grouped = df_morphology.value_counts(['stand']).reset_index(name='count')
df_morphology_grouped

In [None]:
# Donut chart of the stands for the morphology sub hypothesis
labels = np.array(df_morphology_grouped["stand"]).astype(str)
sizes = np.array(df_morphology_grouped["count"]).astype(int)
# NOTE(review): colours are matched to slices by position (the row order of
# df_morphology_grouped), not by stand name — confirm the order.
colors = ['#66FF00','#C0C0C0','#FF0033']
# Total derived from the data so the centre label cannot go stale
total = int(sizes.sum())

# Set the font size before drawing so that this figure already uses it
plt.rcParams.update({'font.size': 15})

fig1, ax1 = plt.subplots(figsize=(9, 7))
ax1.pie(
    sizes,
    colors=colors,
    labels=labels,
    # slice annotation: share in percent followed by the absolute count
    autopct=lambda p: f'{p:.2f}%, {p*total/100 :.0f} ',
    startangle=90,
)
# A white disc in the centre turns the pie into a donut
centre_circle = plt.Circle((0,0),0.70,fc='white')
ax1.add_artist(centre_circle)
# Equal aspect ratio ensures that the pie is drawn as a circle
ax1.axis('equal')
plt.tight_layout()
ax1.text(0, 0, str(total), ha='center', fontsize=30)

plt.savefig("img/hyp1.png")
plt.show()

### The second sub hypothesis considers physiology as the phenotypic plasticity form.

In [None]:
# Contributions whose plasticity form mentions physiology, tallied per stand.
# na=False treats a missing plasticity form as "no match"; without it the
# mask contains NaN and the boolean indexing fails.
df_physiology = df[df['Phenotypic plasticity form'].str.contains('Physiology', na=False)]
df_physiology_grouped = df_physiology.value_counts(['stand']).reset_index(name='count')
df_physiology_grouped

In [None]:
# Donut chart of the stands for the physiology sub hypothesis
labels = np.array(df_physiology_grouped["stand"]).astype(str)
sizes = np.array(df_physiology_grouped["count"]).astype(int)
# NOTE(review): colours are matched to slices by position (the row order of
# df_physiology_grouped), not by stand name — confirm the order.
colors = ['#66FF00','#C0C0C0','#FF0033']
# Total derived from the data so the centre label cannot go stale
total = int(sizes.sum())

# Set the font size before drawing so that this figure already uses it
plt.rcParams.update({'font.size': 15})

fig1, ax1 = plt.subplots(figsize=(9, 7))
ax1.pie(
    sizes,
    colors=colors,
    labels=labels,
    # slice annotation: share in percent followed by the absolute count
    autopct=lambda p: f'{p:.2f}%, {p*total/100 :.0f} ',
    startangle=90,
)
# A white disc in the centre turns the pie into a donut
centre_circle = plt.Circle((0,0),0.70,fc='white')
ax1.add_artist(centre_circle)
# Equal aspect ratio ensures that the pie is drawn as a circle
ax1.axis('equal')
plt.tight_layout()
ax1.text(0, 0, str(total), ha='center', fontsize=30)

plt.savefig("img/hyp2.png")
plt.show()

### The third sub hypothesis considers life history as the phenotypic plasticity form.

In [None]:
# Contributions whose plasticity form mentions life history, tallied per stand.
# na=False treats a missing plasticity form as "no match"; without it the
# mask contains NaN and the boolean indexing fails.
df_life = df[df['Phenotypic plasticity form'].str.contains('Life history', na=False)]
df_life_grouped = df_life.value_counts(['stand']).reset_index(name='count')
df_life_grouped

In [None]:
# Donut chart of the stands for the life-history sub hypothesis
labels = np.array(df_life_grouped["stand"]).astype(str)
sizes = np.array(df_life_grouped["count"]).astype(int)
# NOTE(review): colours are matched to slices by position (the row order of
# df_life_grouped), not by stand name — confirm the order.
colors = ['#66FF00','#C0C0C0','#FF0033']
# Total derived from the data so the centre label cannot go stale
total = int(sizes.sum())

# Set the font size before drawing so that this figure already uses it
plt.rcParams.update({'font.size': 15})

fig1, ax1 = plt.subplots(figsize=(9, 7))
ax1.pie(
    sizes,
    colors=colors,
    labels=labels,
    # slice annotation: share in percent followed by the absolute count
    autopct=lambda p: f'{p:.2f}%, {p*total/100 :.0f} ',
    startangle=90,
)
# A white disc in the centre turns the pie into a donut
centre_circle = plt.Circle((0,0),0.70,fc='white')
ax1.add_artist(centre_circle)
# Equal aspect ratio ensures that the pie is drawn as a circle
ax1.axis('equal')
plt.tight_layout()
ax1.text(0, 0, str(total), ha='center', fontsize=30)

plt.savefig("img/hyp3.png")
plt.show()

### The fourth sub hypothesis considers the behaviour as the phenotypic plasticity form.

In [None]:
# Contributions whose plasticity form mentions behaviour, tallied per stand.
# na=False treats a missing plasticity form as "no match"; without it the
# mask contains NaN and the boolean indexing fails.
df_behaviour = df[df['Phenotypic plasticity form'].str.contains('Behaviour', na=False)]
df_behaviour_grouped = df_behaviour.value_counts(['stand']).reset_index(name='count')
df_behaviour_grouped

In [None]:
# Donut chart of the stands for the behaviour sub hypothesis
labels = np.array(df_behaviour_grouped["stand"]).astype(str)
sizes = np.array(df_behaviour_grouped["count"]).astype(int)
# NOTE(review): colours are matched to slices by position (the row order of
# df_behaviour_grouped), not by stand name — this list is ordered differently
# from the other charts, presumably to follow that row order; confirm.
colors = ['#C0C0C0','#66FF00']
# Total derived from the data so the centre label cannot go stale
total = int(sizes.sum())

# Set the font size before drawing so that this figure already uses it
plt.rcParams.update({'font.size': 15})

fig1, ax1 = plt.subplots(figsize=(9, 7))
ax1.pie(
    sizes,
    colors=colors,
    labels=labels,
    # slice annotation: share in percent followed by the absolute count
    autopct=lambda p: f'{p:.2f}%, {p*total/100 :.0f} ',
    startangle=90,
)
# A white disc in the centre turns the pie into a donut
centre_circle = plt.Circle((0,0),0.70,fc='white')
ax1.add_artist(centre_circle)
# Equal aspect ratio ensures that the pie is drawn as a circle
ax1.axis('equal')
plt.tight_layout()
ax1.text(0, 0, str(total), ha='center', fontsize=30)

plt.savefig("img/hyp4.png")
plt.show()

## The hierarchy of the hypothesis: Create a graph visualization of the main and sub hypotheses and save it in a pdf file (the pdf file is already created in 'img' folder).

### The numbers of studies related to the individual sub hypotheses add up to more than 115 because some studies are related to more than one sub hypothesis.

In [None]:
from graphviz import Digraph

# Hierarchy graph: the main hypothesis (A) points to its four sub hypotheses (B-E)
g = Digraph('G', filename='img/phentyoic_plasticity.gv')
g.edge('A', 'B')
g.edge('A', 'C')
g.edge('A', 'D')
g.edge('A', 'E')
g.node('A', shape='circle',style='wedged', fillcolor="red;0.3:green;0.6:orange")
# Each node is drawn as one of the donut charts saved by the plotting cells.
# Those cells write the files with a lower-case ".png" extension, so the
# image names use the same spelling; ".PNG" is not found on case-sensitive
# filesystems. The names are relative to the folder of the .gv file (img/).
g.node('A', shape='none',label="", image="main_hyp.png")
g.node('B', shape='none',label="", image="hyp1.png")
g.node('C', shape='none',label="", image="hyp2.png")
g.node('D', shape='none',label="", image="hyp3.png")
g.node('E', shape='none',label="", image="hyp4.png")
# Render the graph and open the result in the default viewer
g.view()

## Show the graph visualization here in the notebook

In [None]:
class PDF:
    """Notebook display wrapper that embeds a PDF file.

    Args:
        pdf: Path or URL of the PDF; used as the iframe source and as the
            LaTeX graphics path.
        size: ``(width, height)`` of the iframe.
    """

    def __init__(self, pdf, size=(200, 200)):
        self.pdf = pdf
        self.size = size

    def _repr_html_(self):
        """Return an ``<iframe>`` element that points at the PDF."""
        from html import escape

        # Quote and escape the attribute values so that paths containing
        # spaces, quotes or '&' still produce well-formed HTML.
        src = escape(str(self.pdf), quote=True)
        width = escape(str(self.size[0]), quote=True)
        height = escape(str(self.size[1]), quote=True)
        return f'<iframe src="{src}" width="{width}" height="{height}"></iframe>'

    def _repr_latex_(self):
        """Return an ``\\includegraphics`` command for the PDF."""
        return rf'\includegraphics[width=1.0\textwidth]{{{self.pdf}}}'


# Last expression of the cell: the notebook displays the wrapped PDF
PDF('img/phentyoic_plasticity.gv.pdf',size=(1000,950))

## To test the hypothesis, some articles investigated plant taxa while others investigated non-plant taxa such as birds, reptiles, etc.
### Articles relevant to this hypothesis investigated only one taxon each; therefore it is not interesting to plot the number of investigated taxa.

## Question 3: The literature tests the hypotheses by examining plant and non-plant taxa. What exactly are these non-plant taxa? How many articles investigated each taxon?

In [None]:
# Number of contributions per investigated species, stored in a "count" column
df_species = df[['Investigated species']].value_counts().reset_index(name='count')
df_species

In [None]:
# Taxon names and article counts as arrays, plus the bar colours for the chart
spec = df_species['Investigated species'].to_numpy()
count_spec = df_species['count'].to_numpy()
palette = [
    '#996666', '#ffb3b3', '#330000', '#867979',
    '#ff00bf', '#80ff00', '#ff8000', '#00ffbf',
]

In [None]:
# Bar chart (bokeh): number of articles per investigated taxon

# Cycle through the palette so that every bar gets a colour even when the
# number of taxa differs from the number of palette entries; passing the
# palette directly makes the DataFrame constructor fail on a length mismatch.
bar_colors = [palette[i % len(palette)] for i in range(len(spec))]
df11 = pd.DataFrame(data=dict(species=spec,counts=count_spec,color=bar_colors))

hover1 = HoverTool(
    tooltips=[
        ('Investigated taxon', '@species'),
        ('Number of articles', '@counts')
    ],
    formatters={
        '@species': 'printf',
        '@{counts}' : 'printf'
    }
)

source = ColumnDataSource(df11)
p = figure(x_range= spec,
           plot_height=500, plot_width=970, toolbar_location=None,
           tools=[hover1,WheelZoomTool(), PanTool(), ResetTool(), SaveTool()],
           x_axis_label='Investigated taxon',
           y_axis_label='Number of articles',
          )

p.vbar(x='species',top='counts', width=0.9,color='color',source=source)

# axis label font size and style
p.xaxis.axis_label_text_font_size = "20pt"
p.yaxis.axis_label_text_font_size = "20pt"
p.xaxis.axis_label_text_font_style = 'bold'
p.yaxis.axis_label_text_font_style = 'bold'

# tick labels: slight rotation on the x axis, font size and style
p.xaxis.major_label_orientation = pi/12
p.yaxis.major_label_text_font_size = '11pt'
p.xaxis.major_label_text_font_size = '14pt'
p.yaxis.major_label_text_font_style = 'bold'
p.xaxis.major_label_text_font_style = 'bold'

show(p)

In [None]:
# Write the current bokeh figure `p` to a PNG file in the img folder
export_png(p, filename='img/investigated_species.png')

## Question 4: When were the relevant articles published? In other words, what are the most productive years in terms of publishing articles related to this hypothesis?

In [None]:
# Number of contributions per study date, stored in a "count" column
df_study_date = df[['Study date']].value_counts().reset_index(name='count')
df_study_date

In [None]:
# Study dates and article counts as arrays, plus the bar colours for the chart
study_date = df_study_date['Study date'].to_numpy()
count_dates = df_study_date['count'].to_numpy()
palette = [
    '#996666', '#ffb3b3', '#330000', '#867979', '#ff00bf',
    '#80ff00', '#ff8000', '#00ffbf', '#FFEFD5', '#006400',
    '#808000', '#00FFFF', '#4682B4',
]

In [None]:
# Bar chart (bokeh): number of articles per study date

# Cycle through the palette so that every bar gets a colour even when the
# number of dates differs from the number of palette entries; passing the
# palette directly makes the DataFrame constructor fail on a length mismatch.
bar_colors = [palette[i % len(palette)] for i in range(len(study_date))]
df20 = pd.DataFrame(data=dict(dates=study_date,counts=count_dates,color=bar_colors))

hover1 = HoverTool(
    tooltips=[
        ('Article date', '@dates'),
        ('Number of articles', '@counts')
    ],
    formatters={
        '@dates': 'printf',
        '@{counts}' : 'printf'
    }
)

source = ColumnDataSource(df20)
p = figure(x_range= study_date,
           plot_height=500, plot_width=970, toolbar_location=None,
           tools=[hover1,WheelZoomTool(), PanTool(), ResetTool(), SaveTool()],
           # axis label spelling corrected (was 'Puplication year')
           x_axis_label='Publication year',
           y_axis_label='Number of articles',
          )

p.vbar(x='dates',top='counts', width=0.9,color='color',source=source)

# axis label and title font size and style
p.xaxis.axis_label_text_font_size = "20pt"
p.yaxis.axis_label_text_font_size = "20pt"
p.title.text_font_size = '15pt'
p.xaxis.axis_label_text_font_style = 'bold'
p.yaxis.axis_label_text_font_style = 'bold'

# tick labels: rotation on the x axis, font size and style
p.xaxis.major_label_orientation = pi/6
p.yaxis.major_label_text_font_size = '11pt'
p.xaxis.major_label_text_font_size = '14pt'
p.yaxis.major_label_text_font_style = 'bold'
p.xaxis.major_label_text_font_style = 'bold'

show(p)

In [None]:
# Write the current bokeh figure `p` to a PNG file in the img folder
export_png(p, filename='img/years.png')

## Question 5: Where did the studies related to this hypothesis take place?

In [None]:
# Number of contributions per continent, stored in a "count" column
df_stand_cont = df[['Continent']].value_counts().reset_index(name='count')
df_stand_cont

In [None]:
# Continent names and study counts as arrays, plus the bar colours for the chart
cont = df_stand_cont['Continent'].to_numpy()
count = df_stand_cont['count'].to_numpy()
palette = [
    '#8000ff', '#996666', '#ffb3b3', '#330000',
    '#867979', '#ff00bf', '#80ff00', '#ff8000',
]

In [None]:
# Bar chart (bokeh): number of articles per continent

# Cycle through the palette so that every bar gets a colour even when the
# number of continents differs from the number of palette entries; passing
# the palette directly makes the DataFrame constructor fail on a length
# mismatch.
bar_colors = [palette[i % len(palette)] for i in range(len(cont))]
df10 = pd.DataFrame(data=dict( continents=cont,counts=count,color=bar_colors))

hover1 = HoverTool(
    tooltips=[
        ('Continent', '@continents'),
        ('Number of studies', '@counts')
    ],
    formatters={
        '@continents': 'printf',
        '@{counts}' : 'printf'
    }
)

source = ColumnDataSource(df10)
p = figure(x_range= cont,
           plot_height=500, plot_width=970, toolbar_location=None,
           tools=[hover1,WheelZoomTool(), PanTool(), ResetTool(), SaveTool()],
           x_axis_label='Continent',
           y_axis_label='Number of articles',
          )

p.vbar(x='continents',top='counts', width=0.9,color='color',source=source)

# axis label and title font size and style
p.xaxis.axis_label_text_font_size = "20pt"
p.yaxis.axis_label_text_font_size = "20pt"
p.title.text_font_size = '15pt'
p.xaxis.axis_label_text_font_style = 'bold'
p.yaxis.axis_label_text_font_style = 'bold'

# tick labels: slight rotation on the x axis, font size and style
p.xaxis.major_label_orientation = pi/12
p.yaxis.major_label_text_font_size = '11pt'
p.xaxis.major_label_text_font_size = '14pt'
p.yaxis.major_label_text_font_style = 'bold'
p.xaxis.major_label_text_font_style = 'bold'
show(p)




In [None]:
# Write the current bokeh figure `p` to a PNG file in the img folder
export_png(p, filename='img/continents.png')