In [None]:
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

In [None]:
# IPython magic: select matplotlib's inline backend so figures are embedded in the cell output.
%matplotlib inline

In [None]:
# Global plot appearance: seaborn's default theme, then a white-grid background and
# the colorblind palette mapped onto matplotlib's single-letter color codes.
sns.set_theme()
sns.set_style("whitegrid")
sns.set_color_codes(palette="colorblind")

In [None]:
# Load the development and evaluation CSV files into separate frames.
# NOTE(review): `eval` shadows Python's built-in eval(); the name is kept because
# later cells refer to it.
dev_csv_path = "/content/drive/MyDrive/Colab Notebooks/DSL/development.csv"
eval_csv_path = "/content/drive/MyDrive/Colab Notebooks/DSL/evaluation.csv"
dev = pd.read_csv(dev_csv_path)
eval = pd.read_csv(eval_csv_path)

In [None]:
# Tag each frame with the split it came from (this adds a "set" column to
# `dev` and `eval` themselves), then stack them into one frame with a fresh
# 0..n-1 index so comparison plots can use "set" as the hue.
eval["set"] = "test"
dev["set"] = "dev"

ds = pd.concat(objs=[dev, eval], axis=0, ignore_index=True)

In [None]:
# Boolean row masks over `ds`: `d` selects the development rows, `e` the test rows.
d = ds["set"].eq("dev")
e = ds["set"].eq("test")

In [None]:
# Quick look at the first three rows of the combined frame.
ds.iloc[:3]

In [None]:
# Derived columns used by the plots below.

# Human-readable sentiment label: "positive" when sentiment == 1.0, "negative" otherwise.
# Rows whose sentiment is missing (presumably the evaluation rows — confirm) stay missing
# instead of being silently labelled "negative".
ds["sentiment_lbl"] = (
    ds["sentiment"]
    .eq(1.0)
    .map({True: "positive", False: "negative"})
    .where(ds["sentiment"].notna())
)

# Parse the raw date strings in one vectorized call after dropping the " PDT" marker.
# The dtype guard keeps the cell re-runnable: on a second run the column is already
# datetime and has no text left to strip.
# NOTE(review): assumes every date string shares one format — confirm against the data.
if not pd.api.types.is_datetime64_any_dtype(ds["date"]):
    ds["date"] = pd.to_datetime(ds["date"].str.replace(" PDT", "", regex=False))

ds["weekday"] = ds["date"].dt.weekday  # Monday=0 ... Sunday=6
ds["hour"] = ds["date"].dt.hour
# Seconds since the Unix epoch as a float; the timezone-naive values are read as UTC,
# which matches what Timestamp.timestamp() returned in the per-row version.
ds["timestamp"] = (ds["date"] - pd.Timestamp("1970-01-01")) / pd.Timedelta(seconds=1)

In [None]:
# Persist the combined frame, including the derived columns, without the row index.
combined_csv_path = "/content/drive/MyDrive/Colab Notebooks/DSL/dev_and_eval.csv"
ds.to_csv(combined_csv_path, index=False)

# General Inspection

In [None]:
# (rows, columns) of the development and evaluation frames, in that order.
tuple(frame.shape for frame in (dev, eval))

Null values.

In [None]:
# Number of missing values in each column of the development frame.
dev.isnull().sum()

In [None]:
# Number of missing values in each column of the evaluation frame.
eval.isnull().sum()

Unique entries.

In [None]:
# Count of distinct values per column in the development frame.
dev.nunique(axis=0)

In [None]:
# Count of distinct values per column in the evaluation frame.
eval.nunique(axis=0)

In [None]:
# Pairwise correlation between the numeric columns of the combined frame.
# numeric_only=True is required on pandas >= 2.0, where the default no longer drops
# non-numeric columns and string columns such as "set" make corr() raise.
ds.corr(numeric_only=True)

# By feature inspection
## Sentiment

In [None]:
# How many development rows carry each sentiment value (class balance).
dev.loc[:, "sentiment"].value_counts()

## Ids

In [None]:
# Smallest and largest id in the development frame.
dev_ids = dev["ids"]
dev_ids.min(), dev_ids.max()

In [None]:
# Smallest and largest id in the evaluation frame.
eval_ids = eval["ids"]
eval_ids.min(), eval_ids.max()

In [None]:
# Distribution of ids in the dev and test sets. common_norm=False normalizes each
# hue group on its own, so the two curves are comparable even if the sets differ in size.
sns.histplot(data=ds, x="ids", hue="set", bins=50, element="step",
             stat="proportion", common_norm=False)
plt.title("Distribution of ids by set")
# plt.show() renders the figure; the earlier bare plt.plot() drew nothing and echoed `[]`.
plt.show()

In [None]:
# Distribution of ids within the dev rows, split by sentiment label; each label is
# normalized on its own (common_norm=False).
sns.histplot(data=ds.loc[d], x="ids", hue="sentiment_lbl", bins=50, element="step",
             stat="proportion", common_norm=False)
plt.title("Distribution of ids by sentiment (dev set)")
# plt.show() renders the figure; the earlier bare plt.plot() drew nothing and echoed `[]`.
plt.show()

## Date

In [None]:
# Earliest and latest date among the development rows.
dev_dates = ds.loc[d, "date"]
dev_dates.min(), dev_dates.max()

In [None]:
# Earliest and latest date among the test rows.
test_dates = ds.loc[e, "date"]
test_dates.min(), test_dates.max()

In [None]:
# Distribution of dates in the dev and test sets, each set normalized on its own
# (common_norm=False).
sns.histplot(data=ds, x="date", hue="set", bins=50, element="step",
             stat="proportion", common_norm=False)
plt.title("Distribution of dates by set")
# plt.show() renders the figure; the earlier bare plt.plot() drew nothing and echoed `[]`.
plt.show()

In [None]:
# Distribution of dates within the dev rows, split by sentiment label; each label is
# normalized on its own (common_norm=False).
sns.histplot(data=ds.loc[d], x="date", hue="sentiment_lbl", bins=50, element="step",
             stat="proportion", common_norm=False)
plt.title("Distribution of dates by sentiment (dev set)")
# plt.show() renders the figure; the earlier bare plt.plot() drew nothing and echoed `[]`.
plt.show()

As expected, the timestamp and the id have similar distributions; this is most likely because the ids are assigned sequentially.

In [None]:
# Share of rows per weekday (0-6, one bin per day) in the dev and test sets, each set
# normalized on its own (common_norm=False).
sns.histplot(data=ds, x="weekday", hue="set", bins=7, element="step",
             stat="proportion", common_norm=False)
plt.title("Distribution of weekdays by set")
# plt.show() renders the figure; the earlier bare plt.plot() drew nothing and echoed `[]`.
plt.show()

In [None]:
# Share of dev rows per weekday (0-6, one bin per day), split by sentiment label; each
# label is normalized on its own (common_norm=False).
sns.histplot(data=ds.loc[d], x="weekday", hue="sentiment_lbl", bins=7, element="step",
             stat="proportion", common_norm=False)
plt.title("Distribution of weekdays by sentiment (dev set)")
# plt.show() renders the figure; the earlier bare plt.plot() drew nothing and echoed `[]`.
plt.show()

In [None]:
# Share of rows per hour of day (0-23, one bin per hour) in the dev and test sets, each
# set normalized on its own (common_norm=False).
sns.histplot(data=ds, x="hour", hue="set", bins=24, element="step",
             stat="proportion", common_norm=False)
plt.title("Distribution of hours by set")
# plt.show() renders the figure; the earlier bare plt.plot() drew nothing and echoed `[]`.
plt.show()

In [None]:
# Share of dev rows per hour of day, split by sentiment label; each label is normalized
# on its own (common_norm=False).
# bins=24 gives one bin per hour, matching the dev-vs-test hour plot. The earlier bins=7
# was carried over from the weekday plot and lumped 3-4 hours into each bin.
sns.histplot(data=ds.loc[d], x="hour", hue="sentiment_lbl", bins=24, element="step",
             stat="proportion", common_norm=False)
plt.title("Distribution of hours by sentiment (dev set)")
# plt.show() renders the figure; the earlier bare plt.plot() drew nothing and echoed `[]`.
plt.show()

## User

In [None]:
# Sorted, de-duplicated users of each split, re-indexed from 0 so that Series.equals
# compares them position by position.
train_users = dev["user"].drop_duplicates().sort_values().reset_index(drop=True)
test_users = eval["user"].drop_duplicates().sort_values().reset_index(drop=True)

# True only when both splits contain exactly the same set of users.
train_users.equals(test_users)

In [None]:
# Mean and standard deviation of the number of rows per user in the development frame.
dev_rows_per_user = dev["user"].value_counts()
dev_rows_per_user.mean(), dev_rows_per_user.std()

In [None]:
# Mean and standard deviation of the number of rows per user in the evaluation frame.
eval_rows_per_user = eval["user"].value_counts()
eval_rows_per_user.mean(), eval_rows_per_user.std()