# River Python installation


In [1]:
# Install into the running kernel's environment (%pip, not !pip) and pin the
# release recorded in this notebook's output so re-runs are reproducible.
%pip install river==0.9.0

Collecting river
  Downloading river-0.9.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.5 MB)
[K     |████████████████████████████████| 2.5 MB 6.7 MB/s 
Installing collected packages: river
Successfully installed river-0.9.0


In [1]:
# Explicit %pip magic: does not depend on automagic being enabled and always
# targets the running kernel's environment.
%pip install -U numpy



In [2]:
import river

In [3]:
# List the names exposed at the top level of the river package.
dir(river)

['__all__',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 '__version__',
 'anomaly',
 'base',
 'cluster',
 'compat',
 'compose',
 'datasets',
 'drift',
 'dummy',
 'ensemble',
 'evaluate',
 'facto',
 'feature_extraction',
 'feature_selection',
 'imblearn',
 'linear_model',
 'metrics',
 'model_selection',
 'multiclass',
 'multioutput',
 'naive_bayes',
 'neighbors',
 'neural_net',
 'optim',
 'preprocessing',
 'proba',
 'reco',
 'rules',
 'stats',
 'stream',
 'synth',
 'time_series',
 'tree',
 'utils']

In [7]:
from river.naive_bayes import MultinomialNB

In [8]:
from river.feature_extraction import BagOfWords,TFIDF 

In [17]:
import pandas as pd

In [18]:
def get_all_attributes(package):
    """Tabulate the attribute names of each member of ``package``.

    Every name returned by ``dir(package)``, except a fixed set of module
    dunder attributes, becomes one column; the column holds the names
    returned by ``dir()`` on that member.

    Parameters
    ----------
    package : module
        The package (or any object) whose members are inspected.

    Returns
    -------
    pandas.DataFrame
        One column per member, one attribute name per row. Rows containing a
        missing value are dropped, so the table is truncated to the length of
        the member with the fewest attributes.
    """
    skipped = {
        "__all__", "__builtins__", "__cached__", "__doc__", "__file__",
        "__loader__", "__name__", "__package__", "__path__", "__pdoc__",
        "__spec__", "__version__",
    }
    member_names = [name for name in dir(package) if name not in skipped]
    # Resolve members on the argument itself rather than eval-ing a string
    # hard-coded to "river.", so the function works for any package.
    attribute_lists = [dir(getattr(package, name)) for name in member_names]

    # Rows are members at this point; transpose so each member is a column.
    table = pd.DataFrame(attribute_lists).T
    table.columns = member_names
    return table.dropna()

In [19]:
# Build the overview table for river: one column per top-level member,
# listing the names found inside it.
river_df = get_all_attributes(river)

# Simulating streaming data

## Training list

In [21]:
# Labelled training sentences as (text, label) pairs; the label is either
# "software" or "hardware".
data = [
    ("my python program is runnning", "software"),
    ("I tried to run this program, but it has bugs", "software"),
    ("I need a new machine", "hardware"),
    ("the flashdisk is broken", "hardware"),
    ("We need to test our code", "software"),
    ("programming concepts and testing", "software"),
    ("Electrical device", "hardware"),
    ("device drives", "hardware"),
    ("The generator is broken", "hardware"),
    ("im buidling a REST API", "software"),
    ("design the best API so far", "software"),
    ("they need more electrical wiring", "hardware"),
    ("my code has errors", "software"),
    ("i found some program test faulty", "software"),
    ("i broke the car handle", "hardware"),
    ("i tested the user interface code", "software"),
]

## Testing List

In [22]:
# Held-out (text, label) pairs used to score the trained pipeline.
test_data = [
    ("he writes programs daily", "software"),
    ("my disk is broken", "hardware"),
    ("program mantainance", "software"),
    ("The drive is full", "hardware"),
]

# Building the pipeline

In [23]:
from river.compose import Pipeline

In [24]:
# Two named steps: a bag-of-words vectorizer (lowercasing enabled) followed
# by a multinomial naive Bayes classifier.
pipe_nb = Pipeline(
    ("vectorizer", BagOfWords(lowercase=True)),
    ("nb", MultinomialNB()),
)

# Building our model

## Looping through our dataset

In [26]:
# Feed the labelled sentences to the pipeline one at a time, as if they were
# arriving from a stream.
for text, label in data:
    # learn_one updates the pipeline in place, so its return value is not
    # used. Rebinding pipe_nb to it would replace the model with None on
    # river releases where learn_one returns nothing.
    pipe_nb.learn_one(text, label)

# Making predictions

In [27]:
# Predict the label of a sentence that is not in the training list.
pipe_nb.predict_one("I built an API")

'software'

In [29]:
# Predict the label of a second sentence that is not in the training list.
pipe_nb.predict_one("the hard drive  in the computer is damaged")

'hardware'

# Prediction probability

In [28]:
# Show the probability assigned to each label rather than only the top label.
pipe_nb.predict_proba_one("I built an API")

{'hardware': 0.37130645555525305, 'software': 0.6286935444447462}

# Model accuracy

In [30]:
# Test-then-train evaluation: each test sentence is first predicted with the
# current model, the prediction is scored, and only then is the sentence used
# for learning.
metric = river.metrics.Accuracy()
for text, label in test_data:
    y_pred_before = pipe_nb.predict_one(text)
    # update() and learn_one() are called for their in-place effect only;
    # rebinding the names to their return values would replace the objects
    # with None on river releases where these methods return nothing.
    metric.update(label, y_pred_before)
    pipe_nb.learn_one(text, label)

In [31]:
# Display the accuracy accumulated over the test sentences.
metric

Accuracy: 75.00%