# Pandas:

In [None]:
# Pandas has two core data structures: Series & DataFrame

## Series

In [16]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
# Build a Series from a plain list; no index is given here, so pandas
# supplies its default integer labels.
counts = Series(data=[2, 4, 6, 8])
print(counts)

0    2
1    4
2    6
3    8
dtype: int64


In [2]:
# .values exposes the data held by the Series as an array, without the index labels
print(counts.values)

[2 4 6 8]


In [3]:
# .index holds the labels of the Series (here the default integer labels)
print(counts.index)

Int64Index([0, 1, 2, 3], dtype='int64')


In [21]:
# A Series can carry custom labels: pass them through the `index` argument,
# one label per value.
counts = Series(
    data=[2, 4, 6, 8],
    index=["a", "b", "c", "d"],
)
print(counts)

a    2
b    4
c    6
d    8
dtype: int64


In [22]:
# Values can be looked up by their index label.
# Note: the labels of this Series are strings, so the label is written in quotes:
print(counts["a"])

2


In [23]:
# Reassign: overwrite the value stored under label "c" (this modifies counts in place)
counts["c"]=22
print(counts)

a     2
b     4
c    22
d     8
dtype: int64


In [24]:
# Indexing with a list of labels selects several entries at once and returns a Series
print(counts[["a", "b"]]) # Note the double square brackets: a list inside the indexing brackets

a    2
b    4
dtype: int64


In [25]:
# NumPy functions can be applied to a Series element-wise; the index labels are preserved
print(np.exp(counts))

a    7.389056e+00
b    5.459815e+01
c    3.584913e+09
d    2.980958e+03
dtype: float64


In [28]:
# Arithmetic with a scalar is applied to every element and yields a new Series
print(counts*0.3)

a    0.6
b    1.2
c    6.6
d    2.4
dtype: float64


In [29]:
# Show counts again: np.exp(...) and counts*0.3 returned new objects and did not modify it
print(counts)

a     2
b     4
c    22
d     8
dtype: int64


In [30]:
# A Python dict converts directly to a Series: the keys become the index
# labels and the values become the data.
d = {
    "Alex": 10,
    "John": 12,
    "Mary": 13,
    "Gabi": 15,
    "Noha": 9,
    "Juan": 20,
    "Evan": 20,
}
grades = Series(d)
print(grades)

Alex    10
Evan    20
Gabi    15
John    12
Juan    20
Mary    13
Noha     9
dtype: int64


In [31]:
# Comparisons are element-wise as well: the result is a boolean Series with the same index
print(grades > 12)

Alex    False
Evan     True
Gabi     True
John    False
Juan     True
Mary     True
Noha    False
dtype: bool


In [35]:
# Scale every grade by 1.2; the result is stored in a new Series and grades is not modified
raised= grades * 1.2
print(raised)

Alex    12.0
Evan    24.0
Gabi    18.0
John    14.4
Juan    24.0
Mary    15.6
Noha    10.8
dtype: float64


## DataFrame

In [37]:
# A DataFrame is a spreadsheet-like (think Excel) table whose columns may hold
# different types and which can be indexed by both row and column.
# Intuitively, a DataFrame can be thought of as a dict of Series.

data = {
    "courses": [
        "Python",
        "Perl",
        "Deep Learning",
        "Pattern Recognition",
        "Data Mining",
        "Computational Archives",
        "Health Informatics",
    ],
    "age": [25, 27, 32, 19, 23, 20, 21],
    "names": ["Alex", "John", "Mary", "Gabi", "Noha", "Juan", "Evan"],
}

# Each dict key becomes a column; rows get the default integer labels.
frame = DataFrame(data)
print(frame)

   age                 courses names
0   25                  Python  Alex
1   27                    Perl  John
2   32           Deep Learning  Mary
3   19     Pattern Recognition  Gabi
4   23             Data Mining  Noha
5   20  Computational Archives  Juan
6   21      Health Informatics  Evan


In [39]:
# Rebuild the frame from the same data with custom row labels: `index` supplies one label per row
frame=DataFrame(data, index=["s1", "s2", "s3", "s4", "s5", "s6", "s7" ])
print(frame)

    age                 courses names
s1   25                  Python  Alex
s2   27                    Perl  John
s3   32           Deep Learning  Mary
s4   19     Pattern Recognition  Gabi
s5   23             Data Mining  Noha
s6   20  Computational Archives  Juan
s7   21      Health Informatics  Evan


In [40]:
# Dict-style access with a column name returns that column as a Series
print(frame["names"])

s1    Alex
s2    John
s3    Mary
s4    Gabi
s5    Noha
s6    Juan
s7    Evan
Name: names, dtype: object


In [41]:
# The same column retrieved through attribute access instead of frame["names"]
print(frame.names)

s1    Alex
s2    John
s3    Mary
s4    Gabi
s5    Noha
s6    Juan
s7    Evan
Name: names, dtype: object


In [42]:
# Rows can be retrieved by label with the ".loc" indexer; the row comes back
# as a Series whose index is the column names.
# (".ix" was deprecated in pandas 0.20 and removed in pandas 1.0, so it is not used here.)
print(frame.loc["s1"])

age            25
courses    Python
names        Alex
Name: s1, dtype: object


In [44]:
# Label-based row lookup; ".loc" is used because the ".ix" indexer was removed in pandas 1.0
print(frame.loc["s2"])

age          27
courses    Perl
names      John
Name: s2, dtype: object


In [47]:
# Add a column: assigning a single value fills it in for every row (this modifies frame in place)
frame["School"]="UBC"
print(frame)

    age                 courses names School
s1   25                  Python  Alex    UBC
s2   27                    Perl  John    UBC
s3   32           Deep Learning  Mary    UBC
s4   19     Pattern Recognition  Gabi    UBC
s5   23             Data Mining  Noha    UBC
s6   20  Computational Archives  Juan    UBC
s7   21      Health Informatics  Evan    UBC


In [48]:
# Column labels of the frame
print(frame.columns)

Index([u'age', u'courses', u'names', u'School'], dtype='object')


In [49]:
# Row labels of the frame
print(frame.index)

Index([u's1', u's2', u's3', u's4', u's5', u's6', u's7'], dtype='object')


In [53]:
import statsmodels.api as sm
import pandas as pd
import matplotlib.pyplot as plt
from patsy import dmatrices
from random import shuffle, randint, sample
import seaborn as sns
import numpy as np
%matplotlib inline

# Load the emotion dataset from a comma-separated file whose first row holds the column names.
emotion = pd.read_csv('emotions_p1_extended_lang_id_noduplic_denoised.csv', delimiter=',', header=0)
#----------------------------------------
# Count how many rows carry each value of the "label" column, most frequent first.
# Series.value_counts() is used because the top-level pd.value_counts() is deprecated.
print(emotion["label"].value_counts())

amazement     544395
loathing       74923
admiration     65759
grief          42947
terror         35705
ecstasy        30206
rage            8738
vigilance        695
Name: label, dtype: int64


In [59]:
# Preview the first rows of the loaded frame to check its columns and sample values
emotion.head()

Unnamed: 0,message_id,message,label,lang_id
0,596908062054555648,"Last week , Yuki Kawauchi ran 3 HM in 3 consec...",admiration,en
1,257202468386115584,Had a Turkish bath today . #amazing,amazement,en
2,223865330487930880,Taking my 6yo niece shopping #imintrouble #goi...,ecstasy,nl
3,411617825149566976,<USER> <USER> <USER> Britt and I tried for so ...,grief,en
4,267380735453835264,I love this new song of one direction gotta ad...,amazement,en
