In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Apply matplotlib's built-in 'default' style sheet to the plots in this notebook.
plt.style.use('default')

In [None]:
%matplotlib inline
# With this magic command, plots are rendered inline in the notebook.

In [None]:
# Read the merged mutation table from CSV and preview its first five rows
# (head() defaults to n=5).
mutations_csv = '../data/merged_mutations_nt.csv'
gene_data = pd.read_csv(mutations_csv)
gene_data.head()

#### Let's look at the coverage plot

In [None]:
# pandas delegates plotting to matplotlib; Series.plot() draws a line chart by
# default, with the index on the x-axis and the column values on the y-axis.
# The trailing semicolon hides the Axes repr, as in the later plotting cells.
gene_data['coverage'].plot();

#### Histogram of the coverage

In [None]:
# Bucket the coverage values into 15 bins.
# The trailing semicolon hides the Axes repr, as in the later plotting cells.
gene_data['coverage'].plot.hist(bins=15);

**Change the figure size**

In [None]:
# figsize is (width, height) in inches and applies to this figure only.
# The trailing semicolon hides the Axes repr, as in the later plotting cells.
gene_data['coverage'].plot.hist(bins=15, figsize=(10, 7));

**Some useful parameters to set** 

In [None]:
# Collect the histogram options in one dict and unpack them into the call.
# Options that pandas does not consume itself (linestyle, edgecolor, linewidth)
# are forwarded to matplotlib.
hist_options = {
    'bins': 15,
    'title': "coverage plot",
    'legend': True,
    'fontsize': 12,
    'colormap': 'Pastel1',
    'grid': True,
    'linestyle': '--',
    'edgecolor': 'black',
    'linewidth': 1.2,
}
gene_data['coverage'].plot.hist(**hist_options);

In [None]:
# Set the default figure size (width, height in inches) for every figure
# created after this cell. plt.rcParams is the same object as
# matplotlib.rcParams, so the pyplot module imported at the top of the notebook
# is enough and no mid-notebook import is needed.
plt.rcParams['figure.figsize'] = (10, 7)

In [None]:
# Line chart of the coverage column with explicit styling; the figure size now
# comes from the rcParams default rather than a figsize argument.
coverage = gene_data['coverage']
coverage.plot(
    title="coverage plot", legend=True, grid=False,
    fontsize=14, colormap='Dark2',
    linestyle='--', linewidth=1.2,
);

#### What if you want to add more features to this figure? For example, a horizontal line marking a coverage of 6000.

In [None]:
# Series.plot() returns the matplotlib Axes it drew on; keep that handle so
# further artists can be added to the same figure.
line_options = dict(
    title="coverage plot",
    legend=True,
    fontsize=14,
    colormap='Dark2',
    grid=False,
    linestyle='--',
    linewidth=1.2,
)
ax = gene_data['coverage'].plot(**line_options)
# Solid red horizontal reference line at a coverage of 6000.
ax.axhline(y=6000, color='red', linestyle='-');

#### Creating a bar chart

Let's plot how many mutations exist per gene.

In [None]:
# Read '../data/annotated_DRM.csv' into a DataFrame and preview its first five
# rows (head() defaults to n=5).
annotated_csv = '../data/annotated_DRM.csv'
gene_annotated = pd.read_csv(annotated_csv)
gene_annotated.head()

In [None]:
# Show the last five rows (tail() defaults to n=5).
gene_annotated.tail()

In [None]:
# Keep every row but only the 'gene' and 'freq' columns.
gene_annotated.loc[:, ['gene', 'freq']]

In [None]:
# Group the rows by gene, then count the non-null 'freq' entries in each group.
# The result is a one-column DataFrame indexed by gene; the column keeps the
# name 'freq' even though it now holds counts.
rows_by_gene = gene_annotated.groupby('gene')
gene_mut_count = rows_by_gene[['freq']].count()
gene_mut_count

**How many mutations for each gene?**

In [None]:
# One bar per gene. The plotted column holds per-gene counts but is still named
# 'freq', so the automatic legend would mislabel the bars; hide the legend and
# label the y-axis instead. The trailing semicolon hides the returned-object
# repr, as in the earlier styled plotting cells.
ax = gene_mut_count.plot.bar(legend=False)
ax.set_ylabel('number of mutations');

#### Let's compare the mutant sequence with the wild-type sequence.

In [None]:
# Read the Excel workbook into a DataFrame; with no sheet_name given,
# read_excel loads the first sheet.
wt_mut_path = '../data/intermediate1.xlsx'
wt_mut_data = pd.read_excel(wt_mut_path)

In [None]:
# Keep only the rows whose 'freq' is strictly below 1, then preview the first
# five rows. The filtered frame deliberately replaces wt_mut_data because the
# following cells read that name.
freq_below_one = wt_mut_data['freq'].lt(1)
wt_mut_data = wt_mut_data.loc[freq_below_one]
wt_mut_data.head()

**Histogram of the frequencies of mutations** 

In [None]:
# Histogram of the 'freq' column using pandas' default of 10 bins. Label the
# x-axis with the column name so the figure can be read on its own; the
# trailing semicolon hides the returned-object repr.
ax = wt_mut_data['freq'].plot.hist()
ax.set_xlabel('freq');

**Scatter plot of non-synonymous substitutions**

In [None]:
# Number of rows in which the 'wt' and 'mut' columns hold the same value.
int((wt_mut_data['wt'] == wt_mut_data['mut']).sum())

In [None]:
# Number of rows in which the 'wt' and 'mut' columns hold different values.
wt_differs_from_mut = wt_mut_data['wt'] != wt_mut_data['mut']
len(wt_mut_data.loc[wt_differs_from_mut])

In [None]:
# Select the rows where 'wt' and 'mut' differ (the notebook treats these as the
# non-synonymous substitutions) and preview the first five.
is_changed = wt_mut_data['wt'].ne(wt_mut_data['mut'])
nonsyn_mut = wt_mut_data.loc[is_changed]
nonsyn_mut.head()

In [None]:
# Scatter plot of 'freq' (y) against 'pos' (x) for the selected rows, drawn
# with red circular markers. The trailing semicolon hides the returned-object
# repr, as in the earlier styled plotting cells.
nonsyn_mut.plot(
    kind='scatter',
    x='pos', y='freq',
    fontsize=14,
    color='red',
    marker='o',
);

### Exercise

1. Load the `annotated_DRM_exe1.xlsx` file.

In [None]:
# write the code here

2. Plot the number of mutations per gene for the sample. Which gene has the highest number of mutations?

In [None]:
# write the code here

3. Plot the coverage for the mutated bases in the RT gene. Do all mutated bases have a coverage of more than 1000? Can you show this on the plot with a horizontal line?

In [None]:
# write the code here