In [None]:
import pandas as pd
import numpy as np
import torch
import random
import seaborn as sns
from sklearn.model_selection import train_test_split
import nltk
from wordcloud import WordCloud
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score



In [None]:
# Select the compute device: use CUDA when PyTorch reports a GPU,
# otherwise fall back to the CPU (training will simply be slower).
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")
    print('GPU(s) available: {}'.format(torch.cuda.device_count()))
    print('We will use the GPU: {}'.format(torch.cuda.get_device_name(0)))

# Ask PyTorch to release any cached GPU memory before starting.
torch.cuda.empty_cache()

# Seed every random number generator used in this notebook so that a fresh
# run is reproducible.
seed_val = 41

for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    seed_fn(seed_val)

In [None]:
# Read the command dataset from disk and display summary statistics for it.
df = pd.read_csv("data/Dataset_unsupervised.csv")
df.describe()

**Get to Know the Data**

In [None]:
# Summary statistics computed separately for each value of the 'Camera' label.
df.groupby(by='Camera').describe()

In [None]:
# Bar chart of how many rows carry each 'Camera' label (shows the class balance).
sns.countplot(data=df, x='Camera')

In [None]:
# Word cloud over every command in the dataset.

# Fetch NLTK's stop-word corpus and take the English list so that common
# filler words are excluded from the cloud.
nltk.download('stopwords')
stop_words = stopwords.words('english')

# Lower-case each command and concatenate them into one comma-separated string.
data_text = ",".join(command.lower() for command in df["Command"])

# Configure the word cloud first, then build it from the combined text.
wordcloud = WordCloud(
    background_color="white",
    scale=5,
    stopwords=stop_words,
    max_words=100,
    max_font_size=50,
)
wordcloud.generate(data_text)

# Render the cloud as an image with the axes hidden.
plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.title('Most repeated words in all Commands', fontsize=15)
plt.show()

In [None]:
# Word cloud restricted to commands labelled Camera == 0 (no camera required).
df_nocam = df.loc[df['Camera'] == 0]
data_text_nocam = ",".join(command.lower() for command in df_nocam["Command"])

# Configure the word cloud first, then build it from the combined text.
# `stop_words` is the English stop-word list created in the earlier word-cloud cell.
wordcloud_nocam = WordCloud(
    background_color="white",
    scale=5,
    stopwords=stop_words,
    max_words=100,
    max_font_size=50,
)
wordcloud_nocam.generate(data_text_nocam)

# Render the cloud as an image with the axes hidden.
plt.figure(figsize=(10, 7))
plt.imshow(wordcloud_nocam, interpolation='bilinear')
plt.axis("off")
plt.title('Most repeated words in Commands not requiring a camera', fontsize=15)
plt.show()

In [None]:
# Word cloud restricted to commands labelled Camera == 1 (camera required).
df_cam = df.loc[df['Camera'] == 1]
data_text_cam = ",".join(command.lower() for command in df_cam["Command"])

# Configure the word cloud first, then build it from the combined text.
# `stop_words` is the English stop-word list created in the earlier word-cloud cell.
wordcloud_cam = WordCloud(
    background_color="white",
    scale=5,
    stopwords=stop_words,
    max_words=100,
    max_font_size=50,
)
wordcloud_cam.generate(data_text_cam)

# Render the cloud as an image with the axes hidden.
plt.figure(figsize=(10, 7))
plt.imshow(wordcloud_cam, interpolation='bilinear')
plt.axis("off")
plt.title('Most repeated words in Commands requiring a camera', fontsize=15)
plt.show()

**Downsampling data**

In [None]:
# Balance the two classes by downsampling to the size of the smaller one.
# Taking the minimum avoids the ValueError that `sample` raises when asked for
# more rows than the frame holds (i.e. if the camera class were the smaller one).
n_per_class = min(len(df_cam), len(df_nocam))

# An explicit random_state makes the draw repeatable even when this cell is
# re-run on its own; the global NumPy seed only guarantees that for a fresh
# top-to-bottom run.
df_cam_downsampled = df_cam.sample(n=n_per_class, random_state=seed_val)
df_nocam_downsampled = df_nocam.sample(n=n_per_class, random_state=seed_val)

# NOTE: `df` is rebound to the balanced frame here because the following cells
# read `df`; `df_cam` / `df_nocam` still hold the full per-class data.
df = pd.concat([df_cam_downsampled, df_nocam_downsampled])
df.groupby('Camera').describe()

In [None]:
# Hold out 25% of the balanced data for evaluation; the fixed random_state
# keeps the shuffled split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    df['Command'],
    df['Camera'],
    test_size=0.25,
    random_state=41,
    shuffle=True,
)

**Logistic Regression**

In [None]:
# Text-classification pipeline: TF-IDF features feeding a logistic regression.
#
# The 'saga' solver is chosen because it supports every penalty searched below
# (l1, l2, elasticnet and no penalty). The default 'lbfgs' solver rejects 'l1'
# and 'elasticnet', so with it those grid candidates fail to fit and
# GridSearchCV silently scores them as NaN.
LR_pipeline = Pipeline(steps=[
    ('tf', TfidfVectorizer()),
    ('lgrg', LogisticRegression(solver='saga', max_iter=1000, random_state=seed_val)),
])

# Hyper-parameters explored for every penalty type.
_common_grid = {
    'tf__max_features': [1000, 2000, 3000],
    'tf__ngram_range': [(1, 1), (1, 2)],
    'tf__use_idf': [True, False],
    'lgrg__class_weight': ['balanced', None],
}

# Parameter grid, split in two because 'elasticnet' additionally requires
# `l1_ratio`, which the other penalties do not use.
# `None` (no regularisation) replaces the string 'none', which was deprecated
# in scikit-learn 1.2 and later removed.
pgrid_lgrg = [
    {**_common_grid, 'lgrg__penalty': ['l1', 'l2', None]},
    {**_common_grid, 'lgrg__penalty': ['elasticnet'], 'lgrg__l1_ratio': [0.5]},
]

# Exhaustive search over the grid with 2-fold cross-validation on all CPU cores.
gs_lgrg = GridSearchCV(LR_pipeline, pgrid_lgrg, cv=2, n_jobs=-1, verbose=2)

In [None]:
# Run the grid search on the training split: every parameter combination in
# the grid is cross-validated on (x_train, y_train).
gs_lgrg.fit(x_train, y_train) # Train LR model

In [None]:
# Display the parameter combination that the grid search selected as best.
gs_lgrg.best_params_

In [None]:
# Score the tuned model on both splits; a large gap between the two numbers
# would indicate over-fitting.
train_score = gs_lgrg.score(x_train, y_train)
test_score = gs_lgrg.score(x_test, y_test)
print('Score of train set', train_score)
print('Score of test set', test_score)

In [None]:
# Predict on the held-out split and cross-tabulate true vs. predicted labels.
LR_pred = gs_lgrg.predict(x_test) # Predict on validation data

data = {'true_y': y_test,
        'predicted_y': LR_pred}
df_pred = pd.DataFrame(data, columns=['true_y', 'predicted_y'])
confusion_matrix = pd.crosstab(df_pred['true_y'], df_pred['predicted_y'], rownames=['True'], colnames=['Predicted'])

# fmt='d' prints the cell counts as plain integers; seaborn's default '.2g'
# format would show counts of 100 or more in scientific notation (e.g. 1.2e+02).
sns.heatmap(confusion_matrix, annot=True, fmt='d')
plt.show()

In [None]:
# Accuracy of the tuned model's predictions on the held-out split.
lr_accuracy = accuracy_score(y_test, LR_pred)
print('Accuracy of LR model', lr_accuracy)

In [None]:
# Per-class precision / recall / F1 on the held-out split.
# `target_names` must name the classes, in the same order as `labels`, not the
# true/predicted axes: label 0 = command does not require a camera,
# label 1 = command requires a camera.
target_names = ['no_camera', 'camera']
print(classification_report(y_test, LR_pred, labels=[0, 1], target_names=target_names))

In [None]:
# Sanity check: classify one hand-written command and show the label as a plain int.
sample_commands = ["Follow the drone please"]
sample_label = gs_lgrg.predict(sample_commands)[0]
int(round(sample_label))

In [None]:
import joblib

# Persist the whole fitted GridSearchCV object to disk so that it can be
# reloaded later without re-running the search.
model_path = 'log_reg_model.pkl'
joblib.dump(gs_lgrg, model_path)

In [None]:
# Restore the persisted grid-search object from disk.
# NOTE: joblib files are pickles; only load files you created or trust.
loaded_model = joblib.load('log_reg_model.pkl')

# The restored object can be used for inference just like the in-memory one.
example_command = "fly with my friend wearing the red shirt and black pant"
predictions = loaded_model.predict([example_command])

In [None]:
# NOTE(review): bare attribute access; this only displays the repr of the bound
# `predict` method and does not run a prediction. Looks like leftover
# exploration; confirm whether this cell can be removed.
gs_lgrg.predict