Skip to content

Commit

Permalink
Ability to cast a dataframe's column to a different dtype (#11803)
Browse files Browse the repository at this point in the history
Provides the ability to cast columns in dataframes, lazy dataframes, and
expressions.

<img width="587" alt="Screenshot 2024-02-14 at 13 53 01"
src="https://github.com/nushell/nushell/assets/56345/b894f746-0e37-472e-9fb0-eb6f71f2bf27">

<img width="616" alt="Screenshot 2024-02-14 at 13 52 37"
src="https://github.com/nushell/nushell/assets/56345/cf10efa7-d89c-4189-ab71-d368b2354d19">

<img width="626" alt="Screenshot 2024-02-14 at 13 54 58"
src="https://github.com/nushell/nushell/assets/56345/cd57cdf0-5096-41dd-8ab5-46e3d1e061b8">

---------

Co-authored-by: Jack Wright <jack.wright@disqo.com>
  • Loading branch information
ayax79 and ayax79 committed Feb 15, 2024
1 parent cb67de6 commit 525acf9
Show file tree
Hide file tree
Showing 5 changed files with 245 additions and 34 deletions.
207 changes: 207 additions & 0 deletions crates/nu-cmd-dataframe/src/dataframe/eager/cast.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,207 @@
use crate::dataframe::values::{str_to_dtype, NuExpression, NuLazyFrame};

use super::super::values::NuDataFrame;
use nu_engine::CallExt;
use nu_protocol::{
ast::Call,
engine::{Command, EngineState, Stack},
Category, Example, PipelineData, Record, ShellError, Signature, Span, SyntaxShape, Type, Value,
};
use polars::prelude::*;

#[derive(Clone)]
pub struct CastDF;

impl Command for CastDF {
fn name(&self) -> &str {
"dfr cast"
}

fn usage(&self) -> &str {
"Cast a column to a different dtype."
}

fn signature(&self) -> Signature {
Signature::build(self.name())
.input_output_types(vec![
(
Type::Custom("expression".into()),
Type::Custom("expression".into()),
),
(
Type::Custom("dataframe".into()),
Type::Custom("dataframe".into()),
),
])
.required(
"dtype",
SyntaxShape::String,
"The dtype to cast the column to",
)
.optional(
"column",
SyntaxShape::String,
"The column to cast. Required when used with a dataframe.",
)
.category(Category::Custom("dataframe".into()))
}

fn examples(&self) -> Vec<Example> {
vec![
Example {
description: "Cast a column in a dataframe to a different dtype",
example: "[[a b]; [1 2] [3 4]] | dfr into-df | dfr cast u8 a | dfr schema",
result: Some(Value::record(
Record::from_raw_cols_vals_unchecked(
vec!["a".to_string(), "b".to_string()],
vec![
Value::string("u8", Span::test_data()),
Value::string("i64", Span::test_data()),
],
),
Span::test_data(),
)),
},
Example {
description: "Cast a column in a lazy dataframe to a different dtype",
example: "[[a b]; [1 2] [3 4]] | dfr into-df | dfr into-lazy | dfr cast u8 a | dfr schema",
result: Some(Value::record(
Record::from_raw_cols_vals_unchecked(
vec!["a".to_string(), "b".to_string()],
vec![
Value::string("u8", Span::test_data()),
Value::string("i64", Span::test_data()),
],
),
Span::test_data(),
)),
},
Example {
description: "Cast a column in a expression to a different dtype",
example: r#"[[a b]; [1 2] [1 4]] | dfr into-df | dfr group-by a | dfr agg [ (dfr col b | dfr cast u8 | dfr min | dfr as "b_min") ] | dfr schema"#,
result: None
}
]
}

fn run(
&self,
engine_state: &EngineState,
stack: &mut Stack,
call: &Call,
input: PipelineData,
) -> Result<PipelineData, ShellError> {
let value = input.into_value(call.head);
if NuLazyFrame::can_downcast(&value) {
let (dtype, column_nm) = df_args(engine_state, stack, call)?;
let df = NuLazyFrame::try_from_value(value)?;
command_lazy(call, column_nm, dtype, df)
} else if NuDataFrame::can_downcast(&value) {
let (dtype, column_nm) = df_args(engine_state, stack, call)?;
let df = NuDataFrame::try_from_value(value)?;
command_eager(call, column_nm, dtype, df)
} else {
let dtype: String = call.req(engine_state, stack, 0)?;
let dtype = str_to_dtype(&dtype, call.head)?;

let expr = NuExpression::try_from_value(value)?;
let expr: NuExpression = expr.into_polars().cast(dtype).into();

Ok(PipelineData::Value(
NuExpression::into_value(expr, call.head),
None,
))
}
}
}

fn df_args(
engine_state: &EngineState,
stack: &mut Stack,
call: &Call,
) -> Result<(DataType, String), ShellError> {
let dtype = dtype_arg(engine_state, stack, call)?;
let column_nm: String =
call.opt(engine_state, stack, 1)?
.ok_or(ShellError::MissingParameter {
param_name: "column_name".into(),
span: call.head,
})?;
Ok((dtype, column_nm))
}

fn dtype_arg(
engine_state: &EngineState,
stack: &mut Stack,
call: &Call,
) -> Result<DataType, ShellError> {
let dtype: String = call.req(engine_state, stack, 0)?;
str_to_dtype(&dtype, call.head)
}

fn command_lazy(
call: &Call,
column_nm: String,
dtype: DataType,
lazy: NuLazyFrame,
) -> Result<PipelineData, ShellError> {
let column = col(&column_nm).cast(dtype);
let lazy = lazy.into_polars().with_columns(&[column]);
let lazy = NuLazyFrame::new(false, lazy);

Ok(PipelineData::Value(
NuLazyFrame::into_value(lazy, call.head)?,
None,
))
}

fn command_eager(
call: &Call,
column_nm: String,
dtype: DataType,
nu_df: NuDataFrame,
) -> Result<PipelineData, ShellError> {
let mut df = nu_df.df;
let column = df
.column(&column_nm)
.map_err(|e| ShellError::GenericError {
error: format!("{e}"),
msg: "".into(),
span: Some(call.head),
help: None,
inner: vec![],
})?;

let casted = column.cast(&dtype).map_err(|e| ShellError::GenericError {
error: format!("{e}"),
msg: "".into(),
span: Some(call.head),
help: None,
inner: vec![],
})?;

let _ = df
.with_column(casted)
.map_err(|e| ShellError::GenericError {
error: format!("{e}"),
msg: "".into(),
span: Some(call.head),
help: None,
inner: vec![],
})?;

let df = NuDataFrame::new(false, df);
Ok(PipelineData::Value(df.into_value(call.head), None))
}

#[cfg(test)]
mod test {

use super::super::super::test_dataframe::test_dataframe;
use super::*;

#[test]
fn test_examples() {
test_dataframe(vec![Box::new(CastDF {})])
}
}
3 changes: 3 additions & 0 deletions crates/nu-cmd-dataframe/src/dataframe/eager/mod.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
mod append;
mod cast;
mod columns;
mod drop;
mod drop_duplicates;
Expand Down Expand Up @@ -35,6 +36,7 @@ use nu_protocol::engine::StateWorkingSet;

pub use self::open::OpenDataFrame;
pub use append::AppendDF;
pub use cast::CastDF;
pub use columns::ColumnsDF;
pub use drop::DropDF;
pub use drop_duplicates::DropDuplicates;
Expand Down Expand Up @@ -78,6 +80,7 @@ pub fn add_eager_decls(working_set: &mut StateWorkingSet) {
// Dataframe commands
bind_command!(
AppendDF,
CastDF,
ColumnsDF,
DataTypes,
Summary,
Expand Down
3 changes: 2 additions & 1 deletion crates/nu-cmd-dataframe/src/dataframe/test_dataframe.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ use nu_protocol::{
Example, PipelineData, Span,
};

use super::eager::ToDataFrame;
use super::eager::{SchemaDF, ToDataFrame};
use super::expressions::ExprCol;
use super::lazy::{LazyCollect, ToLazyFrame};
use nu_cmd_lang::Let;
Expand Down Expand Up @@ -36,6 +36,7 @@ pub fn build_test_engine_state(cmds: Vec<Box<dyn Command + 'static>>) -> Box<Eng
working_set.add_decl(Box::new(ToLazyFrame));
working_set.add_decl(Box::new(LazyCollect));
working_set.add_decl(Box::new(ExprCol));
working_set.add_decl(Box::new(SchemaDF));

// Adding the command that is being tested to the working set
for cmd in cmds.clone() {
Expand Down
2 changes: 1 addition & 1 deletion crates/nu-cmd-dataframe/src/dataframe/values/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,5 +10,5 @@ pub use nu_dataframe::{Axis, Column, NuDataFrame};
pub use nu_expression::NuExpression;
pub use nu_lazyframe::NuLazyFrame;
pub use nu_lazygroupby::NuLazyGroupBy;
pub use nu_schema::NuSchema;
pub use nu_schema::{str_to_dtype, NuSchema};
pub use nu_when::NuWhen;
Loading

0 comments on commit 525acf9

Please sign in to comment.