In [88]:
import pandas as pd
import numpy as np
import os
import plotly.figure_factory as ff

### Change table style

In [89]:
%%html
<style>
/* Float rendered tables to the left side of the output area. */
table {float:left}
</style>

### Read the data

* The first dataset contains information about the fighters
* The second dataset contains information about UFC events

In [90]:
basepath = "./datasets/"

# Load the workbooks in a fixed, explicit order. os.listdir() returns entries in
# arbitrary order, but later cells rely on pandas_list[0] being the fighters
# table and pandas_list[1] being the events table.
dataset_files = ["UFC_Social.xlsx", "UFC_PPV_Data.xlsx"]
pandas_list = []

for item in dataset_files:
    print(item)
    fn = os.path.join(basepath, item)      # full path to the workbook
    # Current pandas releases reject an `encoding` keyword in read_excel,
    # so the call passes only the path.
    obj = pd.read_excel(fn)                # open file as a pandas dataframe
    pandas_list.append(obj)

UFC_Social.xlsx
UFC_PPV_Data.xlsx


In [91]:
# Give the two loaded frames descriptive names:
# the first workbook holds fighter data, the second holds event data.
fighter_stats, event_stats = pandas_list[0], pandas_list[1]

In [92]:
# Preview the first rows of the fighters table (the first workbook loaded).
fighter_stats.head()

Unnamed: 0,Athlete,Category,Gender,Nation,Rank,Active,Twitter,Instagram,Facebook
0,Daniel Cormier,Heavyweight,M,USA,C,YES,709000,1900000,
1,Stipe Miocic,Heavyweight,M,USA,1,YES,239000,774000,
2,Francis Ngannou,Heavyweight,M,FRA,2,YES,99700,497000,
3,Derrick Lewis,Heavyweight,M,USA,3,YES,148000,1500000,
4,Curtis Blaydes,Heavyweight,M,USA,4,YES,12700,79100,


### Dataset description


| Column        | Description   |
| ------------- |:-------------:|
| Athlete       | Name of the athlete |
| Category      | Weight class  |
| Gender        | Male or Female|
| Nation        | Athlete's country|
| Rank          | Athlete's rank within UFC category/weight class|
| Active        | Active or retired |
| Twitter       | Twitter followers|
| Instagram     | Instagram followers|
| Facebook      | Facebook followers|     

### Data analytics general flow

Each project focused on processing and analyzing data has these general steps:
* Understanding the data, data type, and various data formats.
* Data manipulation or data wrangling.
* Getting the insight.
* Providing a solution or an answer to a specific question

If you think about it, this all makes sense: when a subject or topic is clear, it's easier to provide answers and solutions.  
So let's take some time to introduce a couple of basic statistics concepts.

### Data cleanup

Facebook appears to have a lot of NaN values. If the count is high, it's a good idea to drop the column.

In [93]:
# Report how many Facebook values are missing. isnull().sum() adds up the True
# flags, whereas isnull().count() only returns the total number of rows.
# The guard keeps the cell re-runnable after the column has already been dropped.
if 'Facebook' in fighter_stats.columns:
    print(fighter_stats['Facebook'].isnull().sum())
    fighter_stats.drop(columns=['Facebook'], inplace=True)
fighter_stats.head()

171


Unnamed: 0,Athlete,Category,Gender,Nation,Rank,Active,Twitter,Instagram
0,Daniel Cormier,Heavyweight,M,USA,C,YES,709000,1900000
1,Stipe Miocic,Heavyweight,M,USA,1,YES,239000,774000
2,Francis Ngannou,Heavyweight,M,FRA,2,YES,99700,497000
3,Derrick Lewis,Heavyweight,M,USA,3,YES,148000,1500000
4,Curtis Blaydes,Heavyweight,M,USA,4,YES,12700,79100


In [94]:
cols = ['Category', 'Gender', 'Nation']
stats = []
for column_name in cols:
    print(column_name, "\n")
    summary = fighter_stats[column_name].describe()
    print(summary)

    # Turn the describe() Series into a two-column frame whose first column
    # ("Statistics") holds the statistic names, and keep it for later display.
    summary_frame = summary.to_frame().reset_index()
    summary_frame = summary_frame.rename(columns={'index': 'Statistics'})
    stats.append(summary_frame)

Category 

count              166
unique               9
top       Bantamweight
freq                26
Name: Category, dtype: object
Gender 

count     166
unique      2
top         M
freq      129
Name: Gender, dtype: object
Nation 

count     166
unique     24
top       USA
freq       83
Name: Nation, dtype: object


In [95]:
columns = ['Twitter', 'Instagram']

# Replace missing follower counts with 0 and assign the result back to the frame.
# Calling fillna(inplace=True) on fighter_stats[c] is a chained in-place update:
# pandas warns about it and, under Copy-on-Write, it leaves fighter_stats unchanged.
fighter_stats[columns] = fighter_stats[columns].fillna(value=0)

In [96]:
def plot_figure(data):
    """Render ``data`` as a Plotly table figure and display it.

    ``data`` is passed through ``pd.DataFrame(...)`` before being handed to
    ``ff.create_table``. The function shows the figure and returns nothing.
    It relies on ``ff`` (``plotly.figure_factory``) from the notebook's import
    cell rather than re-importing it on every call.
    """
    fig = ff.create_table(pd.DataFrame(data))
    fig.show()
# Display one table figure per collected summary, separated by blank lines.
for summary_table in stats:
    plot_figure(summary_table)
    print("\n\n")
















In [97]:
# Summarise the weight-class column, then show the number of athletes per class.
category = fighter_stats['Category']
print(category.describe(), "\n")
category.value_counts()

count              166
unique               9
top       Bantamweight
freq                26
Name: Category, dtype: object 



Bantamweight         26
Flyweight            25
Featherweight        21
Heavyweight          20
Light Heavyweight    17
Welterweight         16
Middleweight         16
Lightweight          15
Strawweight          10
Name: Category, dtype: int64

In [98]:
# Preview the first rows of the events table (the second workbook loaded).
event_stats.head()

Unnamed: 0,Event,Date,Venue,City,State/Province,Country,Attendance,EVENT #,PPV,Explanation,Month,Year,Extra Notes
0,UFC 1: The Beginning,1993-11-12,McNichols Sports Arena,Denver,Colorado,U.S.,7800.0,UFC 1,Not Available,Pre-PPV,November,1993,
1,UFC 2: No Way Out,1994-03-11,Mammoth Gardens,Denver,Colorado,U.S.,2000.0,UFC 2,Not Available,Pre-PPV,March,1994,
2,UFC 3: The American Dream,1994-09-09,Grady Cole Center,Charlotte,North Carolina,U.S.,,UFC 3,Not Available,Pre-PPV,September,1994,
3,UFC 4: Revenge of the Warriors,1994-12-16,Expo Square Pavilion,Tulsa,Oklahoma,U.S.,5857.0,UFC 4,Not Available,Pre-PPV,December,1994,
4,UFC 5: The Return of the Beast,1995-04-07,Independence Arena,Charlotte,North Carolina,U.S.,6000.0,UFC 5,Not Available,Pre-PPV,April,1995,


In [99]:
# Print the frequency of every distinct value for each column listed in `cols`.
for column_name in cols:
    print(column_name, "\n")
    counts = fighter_stats[column_name].value_counts()
    print(counts, "\n")

Category 

Bantamweight         26
Flyweight            25
Featherweight        21
Heavyweight          20
Light Heavyweight    17
Welterweight         16
Middleweight         16
Lightweight          15
Strawweight          10
Name: Category, dtype: int64 

Gender 

M    129
W     37
Name: Gender, dtype: int64 

Nation 

USA    83
BRA    37
RUS     8
POL     4
AUS     4
MEX     3
GBR     3
NZL     3
SWE     3
NED     2
CAN     2
UKR     2
CUB     1
PER     1
JPN     1
ARG     1
CRO     1
CHN     1
SUI     1
ISL     1
KYR     1
IRL     1
BUL     1
FRA     1
Name: Nation, dtype: int64 



### Cleanup social_stats dataset

**Twitter | Instagram** columns are formatted as strings. I will convert them into integers and convert **`NaN`** values
into **0**.

Refs:
* https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.fillna.html
* https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.astype.html