# https://www.kaggle.com/bulentsiyah/data-science-and-visualization-exercise
# A few more plots can be added

In [8]:
!pip install chart-studio
!pip install wordcloud

Collecting wordcloud
  Downloading wordcloud-1.8.0-cp36-cp36m-manylinux1_x86_64.whl (365 kB)
[K     |████████████████████████████████| 365 kB 514 kB/s eta 0:00:01
Installing collected packages: wordcloud
Successfully installed wordcloud-1.8.0


In [9]:
# Data handling and static plotting.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Plotly: chart_studio supplies `plot`, plotly.offline supplies the in-notebook
# renderer `iplot` that the plotting cells in this notebook call.
from chart_studio.plotly import plot
from plotly.offline import init_notebook_mode, iplot
# Must run before the first iplot() call.
# NOTE(review): connected=True presumably loads plotly.js from the CDN rather
# than embedding it in the notebook -- confirm against the plotly docs.
init_notebook_mode(connected=True)
import plotly.graph_objs as go

# Not used by the cells visible here; presumably used further down the notebook.
from wordcloud import WordCloud

In [11]:
# Load the university ranking table; every plotting cell below reads from `df`.
df = pd.read_csv('timesData.csv')

# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line).
df.info()

# Summary statistics of the numeric columns.
print(df.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2603 entries, 0 to 2602
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   world_rank              2603 non-null   object 
 1   university_name         2603 non-null   object 
 2   country                 2603 non-null   object 
 3   teaching                2603 non-null   float64
 4   international           2603 non-null   object 
 5   research                2603 non-null   float64
 6   citations               2603 non-null   float64
 7   income                  2603 non-null   object 
 8   total_score             2603 non-null   object 
 9   num_students            2544 non-null   object 
 10  student_staff_ratio     2544 non-null   float64
 11  international_students  2536 non-null   object 
 12  female_male_ratio       2370 non-null   object 
 13  year                    2603 non-null   int64  
dtypes: float64(4), int64(1), object(9)
memor

# Scatter plot and line plots

In [12]:
# Only the first 20 rows of the table are plotted.
dat = df.head(20)

# Line-only trace: citations (y) against world_rank (x), with the university
# name attached to each point as its text.
trace1 = go.Scatter(
    x=dat['world_rank'],
    y=dat['citations'],
    mode='lines',
    name='citations',
    marker={'color': 'rgba(16,112,2,0.8)'},
    text=dat['university_name'],
)

# Line-plus-marker trace: teaching (y) against the same x values.
trace2 = go.Scatter(
    x=dat['world_rank'],
    y=dat['teaching'],
    mode='lines+markers',
    name='teaching',
    marker={'color': 'rgba(80,26,80,0.8)'},
    text=dat['university_name'],
)

# Figure title plus axis titles; zero lines are switched off on both axes.
layout = {
    'title': 'Citation and Teaching',
    'xaxis': {'title': 'world rank', 'ticklen': 5, 'zeroline': False},
    'yaxis': {'title': 'Citation', 'ticklen': 5, 'zeroline': False},
}

data = [trace1, trace2]
fig = {'data': data, 'layout': layout}
iplot(fig)

# Bar plot

In [13]:
# First four rows of the table whose year is 2014 (input for the bar plot).
df2014 = df.loc[df['year'] == 2014].head(4)

In [18]:
# One bar per university in df2014 for its citations value; the country is
# attached to each bar as its text.
trace1 = go.Bar(
    x=df2014['university_name'],
    y=df2014['citations'],
    name='citations',
    marker={
        'color': 'rgba(255, 174, 225, 0.5)',
        'line': {'color': 'rgb(0,0,0)', 'width': 1.5},
    },
    text=df2014['country'],
)

# Same universities, teaching value; differs from trace1 only in y, name and
# fill colour.
trace2 = go.Bar(
    x=df2014['university_name'],
    y=df2014['teaching'],
    name='teaching',
    marker={
        'color': 'rgba(255, 255, 128, 0.5)',
        'line': {'color': 'rgb(0,0,0)', 'width': 1.5},
    },
    text=df2014['country'],
)

# barmode='group' places the two bars of each university side by side.
layout = go.Layout(barmode='group')
data = [trace1, trace2]
fig = go.Figure(data=data, layout=layout)
iplot(fig)

# Bubble plot

In [21]:
# First 20 rows of the table whose year is 2016.
df2016 = df[df.year==2016].iloc[:20,:]

# Marker size comes from num_students. The column is text and has missing
# entries, so a per-item item.replace(...) would raise AttributeError on a
# missing value; parse the whole column at once and turn anything
# unparseable into NaN instead.
# NOTE(review): the comma -> dot swap is kept from the original; presumably
# values look like "2,243" and are meant to be read as thousands so the
# marker sizes stay small -- confirm this is intended.
size_values = pd.to_numeric(
    df2016.num_students.astype(str).str.replace(',', '.', regex=False),
    errors='coerce')

# Marker colour comes from `international`, which is also a text column;
# non-numeric entries become NaN rather than raising ValueError.
color_values = pd.to_numeric(df2016.international, errors='coerce')

# Keep only the rows for which both marker attributes could be parsed, so
# x / y / text stay aligned with the size and colour lists.
plottable = size_values.notna() & color_values.notna()
bubbles = df2016[plottable]
num_students_size = size_values[plottable].tolist()
international_color = color_values[plottable].tolist()

# Single marker-only trace: teaching (y) against world_rank (x), sized by
# student count and coloured by the international score, with a colour scale.
data = [
    {
        'y': bubbles.teaching,
        'x': bubbles.world_rank,
        'mode': 'markers',
        'marker': {
            'color': international_color,
            'size': num_students_size,
            'showscale': True
        },
        'text': bubbles.university_name
    }
]
iplot(data)

# Histogram

In [23]:
# student_staff_ratio values for each of the two years being compared.
x2011 = df.loc[df['year'] == 2011, 'student_staff_ratio']
x2012 = df.loc[df['year'] == 2012, 'student_staff_ratio']

# One semi-transparent histogram per year.
trace1 = go.Histogram(
    x=x2011,
    opacity=0.75,
    name='2011',
    marker={'color': 'rgba(12,50,196,0.6)'},
)
trace2 = go.Histogram(
    x=x2012,
    opacity=0.75,
    name='2012',
    marker={'color': 'rgba(171,50,96,0.6)'},
)

# barmode='overlay' draws the two histograms on top of each other.
layout = go.Layout(
    barmode='overlay',
    title='students staff ratio in 2011 and 2012',
    xaxis={'title': 'students-staff ratio'},
    yaxis={'title': 'count'},
)

data = [trace1, trace2]
fig = go.Figure(data=data, layout=layout)
iplot(fig)

# Box plot

In [26]:
# All rows of the table whose year is 2015.
x2015 = df[df.year==2015]

# total_score is read from the CSV as text (object dtype), unlike research,
# which is float64. Convert it to numbers before plotting so the box
# statistics are computed on numeric values; entries that are not numeric
# are dropped.
trace0 = go.Box(
                y=pd.to_numeric(x2015.total_score, errors='coerce').dropna(),
                name='total score of universities in 2015',
                marker=dict(
                            color='rgb(12,12,140)'))

# research is already numeric and is plotted as-is.
trace1 = go.Box(
                y=x2015.research,
                name='research of universities in 2015',
                marker=dict(
                            color='rgb(12,128,128)'))
data = [trace0, trace1]
iplot(data)