In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf

In [3]:
# Load the training CSV into a pandas DataFrame and preview its first rows
df = pd.read_csv("train.csv")
print(df.head())

   id keyword location                                               text  \
0   1     NaN      NaN  Our Deeds are the Reason of this #earthquake M...   
1   4     NaN      NaN             Forest fire near La Ronge Sask. Canada   
2   5     NaN      NaN  All residents asked to 'shelter in place' are ...   
3   6     NaN      NaN  13,000 people receive #wildfires evacuation or...   
4   7     NaN      NaN  Just got sent this photo from Ruby #Alaska as ...   

   target  
0       1  
1       1  
2       1  
3       1  
4       1  


In [4]:
# Report the dimensions of the raw training frame as (rows, columns)
print(f"Training dataset shape: {df.shape}")

Training dataset shape: (7613, 5)


In [5]:
# Shuffle every row with a fixed seed, keep only the columns used downstream
# (`text` and `target`), and renumber the index from 0.
# `id`, `keyword` and `location` are discarded; the preview of the raw frame
# shows NaN for `keyword` and `location` in its first rows.
df_shuffled = (
    df.sample(frac=1, random_state=42)
    .drop(columns=["id", "keyword", "location"])
    .reset_index(drop=True)
)
print(df_shuffled.head())

                                                text  target
0  So you have a new weapon that can cause un-ima...       1
1  The f$&amp;@ing things I do for #GISHWHES Just...       0
2  DT @georgegalloway: RT @Galloway4Mayor: ÛÏThe...       1
3  Aftershock back to school kick off was great. ...       0
4  in response to trauma Children of Addicts deve...       0


In [6]:
# Summarise column dtypes and non-null counts of the shuffled frame.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() only appends a stray "None" line.
df_shuffled.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    7613 non-null   object
 1   target  7613 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 119.1+ KB
None


In [7]:
# Class balance: number of tweets per `target` label ("disaster" vs "non-disaster")
print(
    "Total Number of disaster and non-disaster tweets: "
    f"{df_shuffled['target'].value_counts()}"
)

Total Number of disaster and non-disaster tweets: 0    4342
1    3271
Name: target, dtype: int64


In [8]:
# Show the label and the full, untruncated text of the first two shuffled rows
for index, example in df_shuffled.iloc[:2].iterrows():
    print(
        f"Example #{index}",
        f"\tTarget : {example['target']}",
        f"\tText : {example['text']}",
        sep="\n",
    )

Example #0
	Target : 1
	Text : So you have a new weapon that can cause un-imaginable destruction.
Example #1
	Target : 0
	Text : The f$&amp;@ing things I do for #GISHWHES Just got soaked in a deluge going for pads and tampons. Thx @mishacollins @/@


In [9]:
# Hold out a random 10% of the rows (fixed seed); every remaining row goes to training.
# NOTE(review): the message below calls the hold-out set "validation" while the
# variable is named `test_df` — confirm which role is intended.
test_df = df_shuffled.sample(frac=0.1, random_state=42)
train_df = df_shuffled.drop(index=test_df.index)
print(f"Using {len(train_df)} samples for training and {len(test_df)} for validation")

Using 6852 samples for training and 761 for validation


In [10]:
# Number of rows per `target` label in the training split
print(train_df["target"].value_counts())

0    3929
1    2923
Name: target, dtype: int64


In [11]:
# Number of rows per `target` label in the held-out split
print(test_df["target"].value_counts())

0    413
1    348
Name: target, dtype: int64


In [12]:
# tf.data.Dataset

def create_dataset(dataframe: "pd.DataFrame", batch_size: int = 100) -> "tf.data.Dataset":
    """Build a batched, prefetching ``tf.data.Dataset`` of (text, target) pairs.

    Args:
        dataframe: Frame providing a ``text`` column and a ``target`` column.
            Rows are emitted in the frame's current order; no shuffling is
            applied here.
        batch_size: Number of rows per batch. Defaults to 100, the value that
            was previously hard-coded, so existing calls behave as before.

    Returns:
        A dataset whose elements are ``(text_batch, target_batch)`` tuples,
        with prefetching tuned automatically by TensorFlow.
    """
    texts = dataframe["text"].to_numpy()
    targets = dataframe["target"].to_numpy()

    # Slice the two arrays in lockstep so each element pairs a text with its label.
    dataset = tf.data.Dataset.from_tensor_slices((texts, targets))
    dataset = dataset.batch(batch_size)
    # Let tf.data choose the prefetch buffer size at runtime.
    return dataset.prefetch(tf.data.AUTOTUNE)


# Wrap the training and held-out frames in tf.data pipelines
train_ds, test_ds = create_dataset(train_df), create_dataset(test_df)

In [13]:
# Peek at the first batch only and print the shape of its texts and labels.
# `x` and `y` remain bound to that batch after the loop for later inspection.
for x, y in train_ds.take(1):
    print(x.shape)
    print(y.shape)

(100,)
(100,)


In [14]:
# First text of the batch left bound to `x` by the preceding loop cell;
# converted to a NumPy value it displays as a bytes object.
x[0].numpy()

b'So you have a new weapon that can cause un-imaginable destruction.'

In [15]:
# Label paired with that first text, taken from the batch left bound to `y`
y[0].numpy()

1

In [17]:
# Rebuild both pipelines from their source frames instead of re-wrapping the
# existing `train_ds` / `test_ds`: re-running this cell then yields the same
# pipeline every time rather than stacking another cache/prefetch stage on top
# of the previous one.
# cache() stores the batched elements after the first full pass so later epochs
# skip the upstream work; the trailing prefetch uses AUTOTUNE, consistent with
# create_dataset, rather than a fixed buffer of 10.
train_ds = create_dataset(train_df).cache().prefetch(tf.data.AUTOTUNE)
test_ds = create_dataset(test_df).cache().prefetch(tf.data.AUTOTUNE)