Load the dataset from a CSV file into a Pandas DataFrame.

In [None]:
import pandas as pd

# Read the whole CSV into memory; the file is looked up relative to the
# current working directory.
data = pd.read_csv(filepath_or_buffer='data.csv')

Clean the data by removing any rows that contain missing values.

In [None]:
# Drop every row that has at least one missing value in any column.
# axis=0 / how='any' are the pandas defaults, spelled out for clarity;
# inplace=True mutates `data` itself rather than returning a copy.
data.dropna(axis=0, how='any', inplace=True)

Visualize the distribution of text lengths using a histogram.

In [None]:
import matplotlib.pyplot as plt
# Character count of each entry in the 'text' column, bucketed into 30 bins.
plt.hist(
    x=data['text'].str.len(),
    bins=30,
)
plt.show()

Generate and display a word cloud from the text data.

In [None]:
from wordcloud import WordCloud
# Concatenate every entry of the 'text' column into one space-separated
# string and build the word cloud from it.
wordcloud = WordCloud().generate(
    text=' '.join(data['text']),
)

# Render the cloud as an image and hide the axes.
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

Split the dataset into training and testing sets.

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing. A fixed random_state pins the shuffle
# so the same rows land in each split on every run, keeping downstream
# metrics comparable between notebook executions.
train, test = train_test_split(data, test_size=0.2, random_state=42)

Create TF-IDF features from the text data.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Fit the vectorizer on the training text only, then reuse the fitted
# vectorizer to transform the test text, so both matrices share the
# same columns.
tfidf = TfidfVectorizer()
X_train = tfidf.fit_transform(raw_documents=train['text'])
X_test = tfidf.transform(raw_documents=test['text'])

Train a Random Forest model using the training data.

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Seed the forest so bootstrap sampling and feature sub-sampling are
# repeatable; without it, accuracy and feature importances vary run to run.
model = RandomForestClassifier(random_state=42)
# Train on the TF-IDF matrix against the 'label' column of the training rows.
model.fit(X_train, train['label'])

Evaluate the model's accuracy on the test set.

In [None]:
from sklearn.metrics import accuracy_score
# Predict labels for the held-out rows and compare them with the true labels.
y_pred = model.predict(X_test)
accuracy = accuracy_score(test['label'], y_pred)
# An assignment produces no cell output, so report the score explicitly.
print(f"Test accuracy: {accuracy:.4f}")

Compute the trained model's feature importances and rank them in descending order.

In [None]:
# numpy is needed for argsort below and is not imported by any earlier cell.
import numpy as np

# One importance score per TF-IDF column, taken from the fitted model.
importances = model.feature_importances_
# Vocabulary terms in the same column order as the TF-IDF matrix.
features = tfidf.get_feature_names_out()
# argsort sorts ascending; reversing puts the most important feature first,
# so features[indices] and importances[indices] are ranked high-to-low.
indices = np.argsort(importances)[::-1]